Libraries

library(Hmisc)
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
## 
##     format.pval, units
library(Amelia)
## Loading required package: Rcpp
## ## 
## ## Amelia II: Multiple Imputation
## ## (Version 1.8.1, built: 2022-11-18)
## ## Copyright (C) 2005-2023 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
library(ggthemes)
library(caTools)
library(corrplot)
## corrplot 0.92 loaded
library(corrgram)
## 
## Attaching package: 'corrgram'
## The following object is masked from 'package:lattice':
## 
##     panel.fill
library(caret)
## 
## Attaching package: 'caret'
## The following object is masked from 'package:survival':
## 
##     cluster
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
library(rpart)
library(rpart.plot)
library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
library(e1071)
## 
## Attaching package: 'e1071'
## The following object is masked from 'package:Hmisc':
## 
##     impute
library(class)
library(missForest)
library(mice)
## 
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
## 
##     filter
## The following objects are masked from 'package:base':
## 
##     cbind, rbind
library(performance)
library(DataExplorer)
library(lattice)
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:randomForest':
## 
##     combine
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(cluster)
library(ROSE)
## Loaded ROSE 0.0-4
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2
## ──
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.3.0      ✔ stringr 1.5.0 
## ✔ readr   2.1.3      ✔ forcats 0.5.2 
## ✔ purrr   1.0.1      
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::combine()       masks gridExtra::combine(), randomForest::combine()
## ✖ dplyr::filter()        masks mice::filter(), stats::filter()
## ✖ dplyr::lag()           masks stats::lag()
## ✖ purrr::lift()          masks caret::lift()
## ✖ randomForest::margin() masks ggplot2::margin()
## ✖ dplyr::src()           masks Hmisc::src()
## ✖ dplyr::summarize()     masks Hmisc::summarize()
library(xgboost)
## 
## Attaching package: 'xgboost'
## 
## The following object is masked from 'package:dplyr':
## 
##     slice
library(glmnet)
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## 
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## 
## Loaded glmnet 4.1-7
library(officer)
library(vipor)
library(ggplot2)
library(ggbeeswarm)

Import data

data = read.csv("diabetic_data.csv", na.string = "?", header = T, sep = ",")
head(data)
##   encounter_id patient_nbr            race gender     age weight
## 1      2278392     8222157       Caucasian Female  [0-10)   <NA>
## 2       149190    55629189       Caucasian Female [10-20)   <NA>
## 3        64410    86047875 AfricanAmerican Female [20-30)   <NA>
## 4       500364    82442376       Caucasian   Male [30-40)   <NA>
## 5        16680    42519267       Caucasian   Male [40-50)   <NA>
## 6        35754    82637451       Caucasian   Male [50-60)   <NA>
##   admission_type_id discharge_disposition_id admission_source_id
## 1                 6                       25                   1
## 2                 1                        1                   7
## 3                 1                        1                   7
## 4                 1                        1                   7
## 5                 1                        1                   7
## 6                 2                        1                   2
##   time_in_hospital payer_code        medical_specialty num_lab_procedures
## 1                1       <NA> Pediatrics-Endocrinology                 41
## 2                3       <NA>                     <NA>                 59
## 3                2       <NA>                     <NA>                 11
## 4                2       <NA>                     <NA>                 44
## 5                1       <NA>                     <NA>                 51
## 6                3       <NA>                     <NA>                 31
##   num_procedures num_medications number_outpatient number_emergency
## 1              0               1                 0                0
## 2              0              18                 0                0
## 3              5              13                 2                0
## 4              1              16                 0                0
## 5              0               8                 0                0
## 6              6              16                 0                0
##   number_inpatient diag_1 diag_2 diag_3 number_diagnoses max_glu_serum
## 1                0 250.83   <NA>   <NA>                1          None
## 2                0    276 250.01    255                9          None
## 3                1    648    250    V27                6          None
## 4                0      8 250.43    403                7          None
## 5                0    197    157    250                5          None
## 6                0    414    411    250                9          None
##   A1Cresult metformin repaglinide nateglinide chlorpropamide glimepiride
## 1      None        No          No          No             No          No
## 2      None        No          No          No             No          No
## 3      None        No          No          No             No          No
## 4      None        No          No          No             No          No
## 5      None        No          No          No             No          No
## 6      None        No          No          No             No          No
##   acetohexamide glipizide glyburide tolbutamide pioglitazone rosiglitazone
## 1            No        No        No          No           No            No
## 2            No        No        No          No           No            No
## 3            No    Steady        No          No           No            No
## 4            No        No        No          No           No            No
## 5            No    Steady        No          No           No            No
## 6            No        No        No          No           No            No
##   acarbose miglitol troglitazone tolazamide examide citoglipton insulin
## 1       No       No           No         No      No          No      No
## 2       No       No           No         No      No          No      Up
## 3       No       No           No         No      No          No      No
## 4       No       No           No         No      No          No      Up
## 5       No       No           No         No      No          No  Steady
## 6       No       No           No         No      No          No  Steady
##   glyburide.metformin glipizide.metformin glimepiride.pioglitazone
## 1                  No                  No                       No
## 2                  No                  No                       No
## 3                  No                  No                       No
## 4                  No                  No                       No
## 5                  No                  No                       No
## 6                  No                  No                       No
##   metformin.rosiglitazone metformin.pioglitazone change diabetesMed readmitted
## 1                      No                     No     No          No         NO
## 2                      No                     No     Ch         Yes        >30
## 3                      No                     No     No         Yes         NO
## 4                      No                     No     Ch         Yes         NO
## 5                      No                     No     Ch         Yes         NO
## 6                      No                     No     No         Yes        >30

Data Cleaning

#Structure

str(data)
## 'data.frame':    101766 obs. of  50 variables:
##  $ encounter_id            : int  2278392 149190 64410 500364 16680 35754 55842 63768 12522 15738 ...
##  $ patient_nbr             : int  8222157 55629189 86047875 82442376 42519267 82637451 84259809 114882984 48330783 63555939 ...
##  $ race                    : chr  "Caucasian" "Caucasian" "AfricanAmerican" "Caucasian" ...
##  $ gender                  : chr  "Female" "Female" "Female" "Male" ...
##  $ age                     : chr  "[0-10)" "[10-20)" "[20-30)" "[30-40)" ...
##  $ weight                  : chr  NA NA NA NA ...
##  $ admission_type_id       : int  6 1 1 1 1 2 3 1 2 3 ...
##  $ discharge_disposition_id: int  25 1 1 1 1 1 1 1 1 3 ...
##  $ admission_source_id     : int  1 7 7 7 7 2 2 7 4 4 ...
##  $ time_in_hospital        : int  1 3 2 2 1 3 4 5 13 12 ...
##  $ payer_code              : chr  NA NA NA NA ...
##  $ medical_specialty       : chr  "Pediatrics-Endocrinology" NA NA NA ...
##  $ num_lab_procedures      : int  41 59 11 44 51 31 70 73 68 33 ...
##  $ num_procedures          : int  0 0 5 1 0 6 1 0 2 3 ...
##  $ num_medications         : int  1 18 13 16 8 16 21 12 28 18 ...
##  $ number_outpatient       : int  0 0 2 0 0 0 0 0 0 0 ...
##  $ number_emergency        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ number_inpatient        : int  0 0 1 0 0 0 0 0 0 0 ...
##  $ diag_1                  : chr  "250.83" "276" "648" "8" ...
##  $ diag_2                  : chr  NA "250.01" "250" "250.43" ...
##  $ diag_3                  : chr  NA "255" "V27" "403" ...
##  $ number_diagnoses        : int  1 9 6 7 5 9 7 8 8 8 ...
##  $ max_glu_serum           : chr  "None" "None" "None" "None" ...
##  $ A1Cresult               : chr  "None" "None" "None" "None" ...
##  $ metformin               : chr  "No" "No" "No" "No" ...
##  $ repaglinide             : chr  "No" "No" "No" "No" ...
##  $ nateglinide             : chr  "No" "No" "No" "No" ...
##  $ chlorpropamide          : chr  "No" "No" "No" "No" ...
##  $ glimepiride             : chr  "No" "No" "No" "No" ...
##  $ acetohexamide           : chr  "No" "No" "No" "No" ...
##  $ glipizide               : chr  "No" "No" "Steady" "No" ...
##  $ glyburide               : chr  "No" "No" "No" "No" ...
##  $ tolbutamide             : chr  "No" "No" "No" "No" ...
##  $ pioglitazone            : chr  "No" "No" "No" "No" ...
##  $ rosiglitazone           : chr  "No" "No" "No" "No" ...
##  $ acarbose                : chr  "No" "No" "No" "No" ...
##  $ miglitol                : chr  "No" "No" "No" "No" ...
##  $ troglitazone            : chr  "No" "No" "No" "No" ...
##  $ tolazamide              : chr  "No" "No" "No" "No" ...
##  $ examide                 : chr  "No" "No" "No" "No" ...
##  $ citoglipton             : chr  "No" "No" "No" "No" ...
##  $ insulin                 : chr  "No" "Up" "No" "Up" ...
##  $ glyburide.metformin     : chr  "No" "No" "No" "No" ...
##  $ glipizide.metformin     : chr  "No" "No" "No" "No" ...
##  $ glimepiride.pioglitazone: chr  "No" "No" "No" "No" ...
##  $ metformin.rosiglitazone : chr  "No" "No" "No" "No" ...
##  $ metformin.pioglitazone  : chr  "No" "No" "No" "No" ...
##  $ change                  : chr  "No" "Ch" "No" "Ch" ...
##  $ diabetesMed             : chr  "No" "Yes" "Yes" "Yes" ...
##  $ readmitted              : chr  "NO" ">30" "NO" "NO" ...

#Feature list and corresponding feature numbers

col_numbers <- cbind(colnames(data))
col_numbers
##       [,1]                      
##  [1,] "encounter_id"            
##  [2,] "patient_nbr"             
##  [3,] "race"                    
##  [4,] "gender"                  
##  [5,] "age"                     
##  [6,] "weight"                  
##  [7,] "admission_type_id"       
##  [8,] "discharge_disposition_id"
##  [9,] "admission_source_id"     
## [10,] "time_in_hospital"        
## [11,] "payer_code"              
## [12,] "medical_specialty"       
## [13,] "num_lab_procedures"      
## [14,] "num_procedures"          
## [15,] "num_medications"         
## [16,] "number_outpatient"       
## [17,] "number_emergency"        
## [18,] "number_inpatient"        
## [19,] "diag_1"                  
## [20,] "diag_2"                  
## [21,] "diag_3"                  
## [22,] "number_diagnoses"        
## [23,] "max_glu_serum"           
## [24,] "A1Cresult"               
## [25,] "metformin"               
## [26,] "repaglinide"             
## [27,] "nateglinide"             
## [28,] "chlorpropamide"          
## [29,] "glimepiride"             
## [30,] "acetohexamide"           
## [31,] "glipizide"               
## [32,] "glyburide"               
## [33,] "tolbutamide"             
## [34,] "pioglitazone"            
## [35,] "rosiglitazone"           
## [36,] "acarbose"                
## [37,] "miglitol"                
## [38,] "troglitazone"            
## [39,] "tolazamide"              
## [40,] "examide"                 
## [41,] "citoglipton"             
## [42,] "insulin"                 
## [43,] "glyburide.metformin"     
## [44,] "glipizide.metformin"     
## [45,] "glimepiride.pioglitazone"
## [46,] "metformin.rosiglitazone" 
## [47,] "metformin.pioglitazone"  
## [48,] "change"                  
## [49,] "diabetesMed"             
## [50,] "readmitted"

Handling missing data

any(is.na(data))
## [1] TRUE

#plot missing data

plot_missing(data)

#Remove variables with significant missing values, non-useful variables

#removing variables: 
#encounter id, pt number: identifications not required. 
#weight, payor code: significant missing values 
# medical specialty: unreasonable number of levels.
data <- data %>% select(-encounter_id, -patient_nbr, -weight, -payer_code, -medical_specialty)

#removing all medications 
data <- data[, -(20:42)]

#check structure after removal

str(data)
## 'data.frame':    101766 obs. of  22 variables:
##  $ race                    : chr  "Caucasian" "Caucasian" "AfricanAmerican" "Caucasian" ...
##  $ gender                  : chr  "Female" "Female" "Female" "Male" ...
##  $ age                     : chr  "[0-10)" "[10-20)" "[20-30)" "[30-40)" ...
##  $ admission_type_id       : int  6 1 1 1 1 2 3 1 2 3 ...
##  $ discharge_disposition_id: int  25 1 1 1 1 1 1 1 1 3 ...
##  $ admission_source_id     : int  1 7 7 7 7 2 2 7 4 4 ...
##  $ time_in_hospital        : int  1 3 2 2 1 3 4 5 13 12 ...
##  $ num_lab_procedures      : int  41 59 11 44 51 31 70 73 68 33 ...
##  $ num_procedures          : int  0 0 5 1 0 6 1 0 2 3 ...
##  $ num_medications         : int  1 18 13 16 8 16 21 12 28 18 ...
##  $ number_outpatient       : int  0 0 2 0 0 0 0 0 0 0 ...
##  $ number_emergency        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ number_inpatient        : int  0 0 1 0 0 0 0 0 0 0 ...
##  $ diag_1                  : chr  "250.83" "276" "648" "8" ...
##  $ diag_2                  : chr  NA "250.01" "250" "250.43" ...
##  $ diag_3                  : chr  NA "255" "V27" "403" ...
##  $ number_diagnoses        : int  1 9 6 7 5 9 7 8 8 8 ...
##  $ max_glu_serum           : chr  "None" "None" "None" "None" ...
##  $ A1Cresult               : chr  "None" "None" "None" "None" ...
##  $ change                  : chr  "No" "Ch" "No" "Ch" ...
##  $ diabetesMed             : chr  "No" "Yes" "Yes" "Yes" ...
##  $ readmitted              : chr  "NO" ">30" "NO" "NO" ...

#check for NAs in retained variables

any(is.na(data))
## [1] TRUE

#plot missing data

plot_missing(data)

#remove observations with NAs

data <- na.omit(data)

#check

plot_missing(data)

Data Conversion

#all chr variables to factor

data[] <- lapply(data, function(x) if(is.character(x)) as.factor(x) else x)

#variables admission_type_id, admission_type_id, admission_source_id to factor

data$discharge_disposition_id <- as.factor(data$discharge_disposition_id)
data$admission_type_id <- as.factor(data$admission_type_id)
data$admission_source_id <- as.factor(data$admission_source_id)

#check conversion

str(data)
## 'data.frame':    98053 obs. of  22 variables:
##  $ race                    : Factor w/ 5 levels "AfricanAmerican",..: 3 1 3 3 3 3 3 3 3 1 ...
##  $ gender                  : Factor w/ 3 levels "Female","Male",..: 1 1 2 2 2 2 2 1 1 1 ...
##  $ age                     : Factor w/ 10 levels "[0-10)","[10-20)",..: 2 3 4 5 6 7 8 9 10 5 ...
##  $ admission_type_id       : Factor w/ 8 levels "1","2","3","4",..: 1 1 1 1 2 3 1 2 3 1 ...
##  $ discharge_disposition_id: Factor w/ 26 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 3 1 ...
##  $ admission_source_id     : Factor w/ 17 levels "1","2","3","4",..: 7 7 7 7 2 2 7 4 4 7 ...
##  $ time_in_hospital        : int  3 2 2 1 3 4 5 13 12 9 ...
##  $ num_lab_procedures      : int  59 11 44 51 31 70 73 68 33 47 ...
##  $ num_procedures          : int  0 5 1 0 6 1 0 2 3 2 ...
##  $ num_medications         : int  18 13 16 8 16 21 12 28 18 17 ...
##  $ number_outpatient       : int  0 2 0 0 0 0 0 0 0 0 ...
##  $ number_emergency        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ number_inpatient        : int  0 1 0 0 0 0 0 0 0 0 ...
##  $ diag_1                  : Factor w/ 713 levels "10","11","110",..: 144 455 554 55 264 264 277 253 283 121 ...
##  $ diag_2                  : Factor w/ 740 levels "11","110","112",..: 78 77 96 24 245 245 313 259 46 240 ...
##  $ diag_3                  : Factor w/ 786 levels "11","110","111",..: 122 764 249 87 87 768 87 230 317 666 ...
##  $ number_diagnoses        : int  9 6 7 5 9 7 8 8 8 9 ...
##  $ max_glu_serum           : Factor w/ 4 levels ">200",">300",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ A1Cresult               : Factor w/ 4 levels ">7",">8","None",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ change                  : Factor w/ 2 levels "Ch","No": 1 2 1 1 2 1 2 1 1 2 ...
##  $ diabetesMed             : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 2 2 ...
##  $ readmitted              : Factor w/ 3 levels "<30",">30","NO": 2 3 3 3 2 3 2 3 3 2 ...
##  - attr(*, "na.action")= 'omit' Named int [1:3713] 1 20 21 22 55 66 67 88 100 112 ...
##   ..- attr(*, "names")= chr [1:3713] "1" "20" "21" "22" ...

Re-factor response variable

#Function to refactor

binary = function(x){
  if(x == ">30" | x == "<30"){
    return("Yes")
  }else{return("No")}
}

#apply fucntion to response variable

data$readmitted <- sapply(data[, c('readmitted')], function(x) binary(x)) 

#check reponse variable levels

table(data$readmitted)
## 
##    No   Yes 
## 52338 45715

#response variable to factor

data$readmitted <- as.factor(data$readmitted)

#check response variable type

class(data$readmitted)
## [1] "factor"

Remove invalid observations

#Remove "None" observations for A1C and max glucose serum. None = not measured
#Remove Unknown/Invalid observations for gender variable 

data = subset(data, gender!= "Unknown/Invalid", )
data = subset(data, A1Cresult != "None", )             
data = subset(data, max_glu_serum != "None", )
data = droplevels(data)

#Check if “None” observations removed

table(data$gender)
## 
## Female   Male 
##    168    121
table(data$A1Cresult)
## 
##   >7   >8 Norm 
##   63  171   55
table(data$max_glu_serum)
## 
## >200 >300 Norm 
##   69  124   96

Dummy Variables

#Creating dummy variables

dummies <- predict(dummyVars(~race + gender + age + admission_type_id + discharge_disposition_id + admission_source_id + max_glu_serum + A1Cresult + change + diabetesMed, data = data), newdata = data)

#check if dummies created

head(dummies)
##     race.AfricanAmerican race.Asian race.Caucasian race.Hispanic race.Other
## 163                    0          0              1             0          0
## 461                    1          0              0             0          0
## 594                    0          0              1             0          0
## 697                    0          0              0             0          1
## 772                    0          0              1             0          0
## 824                    0          0              1             0          0
##     gender.Female gender.Male age.[10-20) age.[20-30) age.[30-40) age.[40-50)
## 163             0           1           0           0           0           0
## 461             1           0           0           0           0           0
## 594             1           0           0           0           0           0
## 697             0           1           0           0           0           0
## 772             1           0           0           0           1           0
## 824             0           1           0           0           0           0
##     age.[50-60) age.[60-70) age.[70-80) age.[80-90) admission_type_id.1
## 163           0           0           0           1                   0
## 461           0           0           1           0                   0
## 594           1           0           0           0                   0
## 697           0           0           1           0                   0
## 772           0           0           0           0                   0
## 824           0           0           0           1                   0
##     admission_type_id.2 admission_type_id.3 admission_type_id.6
## 163                   0                   0                   1
## 461                   0                   0                   1
## 594                   0                   0                   1
## 697                   0                   0                   1
## 772                   0                   0                   1
## 824                   0                   0                   1
##     discharge_disposition_id.1 discharge_disposition_id.2
## 163                          0                          0
## 461                          1                          0
## 594                          1                          0
## 697                          0                          0
## 772                          1                          0
## 824                          1                          0
##     discharge_disposition_id.3 discharge_disposition_id.5
## 163                          1                          0
## 461                          0                          0
## 594                          0                          0
## 697                          0                          0
## 772                          0                          0
## 824                          0                          0
##     discharge_disposition_id.6 discharge_disposition_id.7
## 163                          0                          0
## 461                          0                          0
## 594                          0                          0
## 697                          1                          0
## 772                          0                          0
## 824                          0                          0
##     discharge_disposition_id.10 discharge_disposition_id.11
## 163                           0                           0
## 461                           0                           0
## 594                           0                           0
## 697                           0                           0
## 772                           0                           0
## 824                           0                           0
##     discharge_disposition_id.13 admission_source_id.1 admission_source_id.2
## 163                           0                     0                     0
## 461                           0                     0                     0
## 594                           0                     0                     0
## 697                           0                     0                     0
## 772                           0                     0                     1
## 824                           0                     0                     0
##     admission_source_id.7 max_glu_serum.>200 max_glu_serum.>300
## 163                     1                  1                  0
## 461                     1                  0                  1
## 594                     1                  0                  1
## 697                     1                  1                  0
## 772                     0                  0                  0
## 824                     1                  0                  1
##     max_glu_serum.Norm A1Cresult.>7 A1Cresult.>8 A1Cresult.Norm change.Ch
## 163                  0            0            0              1         0
## 461                  0            0            1              0         1
## 594                  0            0            1              0         0
## 697                  0            1            0              0         0
## 772                  1            1            0              0         0
## 824                  0            1            0              0         0
##     change.No diabetesMed.No diabetesMed.Yes
## 163         1              1               0
## 461         0              0               1
## 594         1              0               1
## 697         1              0               1
## 772         1              1               0
## 824         1              0               1

#rename columns

colnames(dummies)
##  [1] "race.AfricanAmerican"        "race.Asian"                 
##  [3] "race.Caucasian"              "race.Hispanic"              
##  [5] "race.Other"                  "gender.Female"              
##  [7] "gender.Male"                 "age.[10-20)"                
##  [9] "age.[20-30)"                 "age.[30-40)"                
## [11] "age.[40-50)"                 "age.[50-60)"                
## [13] "age.[60-70)"                 "age.[70-80)"                
## [15] "age.[80-90)"                 "admission_type_id.1"        
## [17] "admission_type_id.2"         "admission_type_id.3"        
## [19] "admission_type_id.6"         "discharge_disposition_id.1" 
## [21] "discharge_disposition_id.2"  "discharge_disposition_id.3" 
## [23] "discharge_disposition_id.5"  "discharge_disposition_id.6" 
## [25] "discharge_disposition_id.7"  "discharge_disposition_id.10"
## [27] "discharge_disposition_id.11" "discharge_disposition_id.13"
## [29] "admission_source_id.1"       "admission_source_id.2"      
## [31] "admission_source_id.7"       "max_glu_serum.>200"         
## [33] "max_glu_serum.>300"          "max_glu_serum.Norm"         
## [35] "A1Cresult.>7"                "A1Cresult.>8"               
## [37] "A1Cresult.Norm"              "change.Ch"                  
## [39] "change.No"                   "diabetesMed.No"             
## [41] "diabetesMed.Yes"
colnames(dummies)[colnames(dummies) == "A1Cresult.>7"] <- "A1Cresult_7"
colnames(dummies)[colnames(dummies) == "A1Cresult.>8"] <- "A1Cresult_8"
colnames(dummies)[colnames(dummies) == "A1Cresult.Norm"] <- "A1Cresult_Norm"
colnames(dummies)[colnames(dummies) == "max_glu_serum.>200"] <- "max_glu_serum_200"
colnames(dummies)[colnames(dummies) == "max_glu_serum.>300"] <- "max_glu_serum_300"
colnames(dummies)[colnames(dummies) == "max_glu_serum.Norm"] <- "max_glu_serum_Norm"
colnames(dummies)[colnames(dummies) == "change.Ch"] <- "med_change_yes"
colnames(dummies)[colnames(dummies) == "change.No"] <- "med_change_no"
colnames(dummies)[colnames(dummies) == "diabetesMed.No"] <- "diabetes_med_yes"
colnames(dummies)[colnames(dummies) == "diabetesMed.Yes"] <- "diabetes_med_no"
colnames(dummies)[colnames(dummies) == "age.[10-20)"] <- "age_10_20 "
colnames(dummies)[colnames(dummies) == "age.[20-30)"] <- "age_20_30"
colnames(dummies)[colnames(dummies) == "age.[30-40)"] <- "age_30_40"
colnames(dummies)[colnames(dummies) == "age.[40-50)"] <- "age_40_50"
colnames(dummies)[colnames(dummies) == "age.[50-60)"] <- "age_50_60"
colnames(dummies)[colnames(dummies) == "age.[60-70)"] <- "age_60_70"
colnames(dummies)[colnames(dummies) == "age.[70-80)"] <- "age_70_80"
colnames(dummies)[colnames(dummies) == "age.[80-90)"] <- "age_80_90"
colnames(dummies)[colnames(dummies) == "race.AfricanAmerican"] <- "race_AfricanAmerican"
colnames(dummies)[colnames(dummies) == "race.Asian"] <- "race_Asian"
colnames(dummies)[colnames(dummies) == "race.Caucasian"] <- "race_Caucasian"
colnames(dummies)[colnames(dummies) == "race.Hispanic"] <- "race_Hispanic"
colnames(dummies)[colnames(dummies) == "race.Other"] <- "race_Other"

#check column name chages

colnames(dummies)
##  [1] "race_AfricanAmerican"        "race_Asian"                 
##  [3] "race_Caucasian"              "race_Hispanic"              
##  [5] "race_Other"                  "gender.Female"              
##  [7] "gender.Male"                 "age_10_20 "                 
##  [9] "age_20_30"                   "age_30_40"                  
## [11] "age_40_50"                   "age_50_60"                  
## [13] "age_60_70"                   "age_70_80"                  
## [15] "age_80_90"                   "admission_type_id.1"        
## [17] "admission_type_id.2"         "admission_type_id.3"        
## [19] "admission_type_id.6"         "discharge_disposition_id.1" 
## [21] "discharge_disposition_id.2"  "discharge_disposition_id.3" 
## [23] "discharge_disposition_id.5"  "discharge_disposition_id.6" 
## [25] "discharge_disposition_id.7"  "discharge_disposition_id.10"
## [27] "discharge_disposition_id.11" "discharge_disposition_id.13"
## [29] "admission_source_id.1"       "admission_source_id.2"      
## [31] "admission_source_id.7"       "max_glu_serum_200"          
## [33] "max_glu_serum_300"           "max_glu_serum_Norm"         
## [35] "A1Cresult_7"                 "A1Cresult_8"                
## [37] "A1Cresult_Norm"              "med_change_yes"             
## [39] "med_change_no"               "diabetes_med_yes"           
## [41] "diabetes_med_no"

#merge dummy variables for and original dataset

data <- cbind(data,dummies)

#check merging

str(data)
## 'data.frame':    289 obs. of  63 variables:
##  $ race                       : Factor w/ 5 levels "AfricanAmerican",..: 3 1 3 5 3 3 4 3 3 3 ...
##  $ gender                     : Factor w/ 2 levels "Female","Male": 2 1 1 2 1 2 1 1 1 2 ...
##  $ age                        : Factor w/ 8 levels "[10-20)","[20-30)",..: 8 7 5 7 3 8 5 4 5 5 ...
##  $ admission_type_id          : Factor w/ 4 levels "1","2","3","6": 4 4 4 4 4 4 4 4 4 4 ...
##  $ discharge_disposition_id   : Factor w/ 9 levels "1","2","3","5",..: 3 1 1 5 1 1 1 1 1 7 ...
##  $ admission_source_id        : Factor w/ 3 levels "1","2","7": 3 3 3 3 2 3 3 3 3 1 ...
##  $ time_in_hospital           : int  5 10 2 11 14 7 2 3 2 4 ...
##  $ num_lab_procedures         : int  47 72 61 71 43 105 66 76 43 41 ...
##  $ num_procedures             : int  1 1 0 1 0 3 0 0 0 1 ...
##  $ num_medications            : int  6 19 5 20 11 16 3 9 13 8 ...
##  $ number_outpatient          : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ number_emergency           : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ number_inpatient           : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ diag_1                     : Factor w/ 90 levels "112","162","188",..: 24 5 19 85 22 36 6 21 71 74 ...
##  $ diag_2                     : Factor w/ 103 levels "162","174","197",..: 28 23 89 9 7 9 42 23 10 66 ...
##  $ diag_3                     : Factor w/ 103 levels "198","208","211",..: 46 28 9 99 65 21 19 31 92 6 ...
##  $ number_diagnoses           : int  5 5 5 5 3 5 3 5 5 3 ...
##  $ max_glu_serum              : Factor w/ 3 levels ">200",">300",..: 1 2 2 1 3 2 3 2 2 1 ...
##  $ A1Cresult                  : Factor w/ 3 levels ">7",">8","Norm": 3 2 2 1 1 1 1 1 1 2 ...
##  $ change                     : Factor w/ 2 levels "Ch","No": 2 1 2 2 2 2 2 1 2 2 ...
##  $ diabetesMed                : Factor w/ 2 levels "No","Yes": 1 2 2 2 1 2 2 2 1 2 ...
##  $ readmitted                 : Factor w/ 2 levels "No","Yes": 2 2 1 1 2 2 2 2 2 2 ...
##  $ race_AfricanAmerican       : num  0 1 0 0 0 0 0 0 0 0 ...
##  $ race_Asian                 : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ race_Caucasian             : num  1 0 1 0 1 1 0 1 1 1 ...
##  $ race_Hispanic              : num  0 0 0 0 0 0 1 0 0 0 ...
##  $ race_Other                 : num  0 0 0 1 0 0 0 0 0 0 ...
##  $ gender.Female              : num  0 1 1 0 1 0 1 1 1 0 ...
##  $ gender.Male                : num  1 0 0 1 0 1 0 0 0 1 ...
##  $ age_10_20                  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ age_20_30                  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ age_30_40                  : num  0 0 0 0 1 0 0 0 0 0 ...
##  $ age_40_50                  : num  0 0 0 0 0 0 0 1 0 0 ...
##  $ age_50_60                  : num  0 0 1 0 0 0 1 0 1 1 ...
##  $ age_60_70                  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ age_70_80                  : num  0 1 0 1 0 0 0 0 0 0 ...
##  $ age_80_90                  : num  1 0 0 0 0 1 0 0 0 0 ...
##  $ admission_type_id.1        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ admission_type_id.2        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ admission_type_id.3        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ admission_type_id.6        : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ discharge_disposition_id.1 : num  0 1 1 0 1 1 1 1 1 0 ...
##  $ discharge_disposition_id.2 : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ discharge_disposition_id.3 : num  1 0 0 0 0 0 0 0 0 0 ...
##  $ discharge_disposition_id.5 : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ discharge_disposition_id.6 : num  0 0 0 1 0 0 0 0 0 0 ...
##  $ discharge_disposition_id.7 : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ discharge_disposition_id.10: num  0 0 0 0 0 0 0 0 0 1 ...
##  $ discharge_disposition_id.11: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ discharge_disposition_id.13: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ admission_source_id.1      : num  0 0 0 0 0 0 0 0 0 1 ...
##  $ admission_source_id.2      : num  0 0 0 0 1 0 0 0 0 0 ...
##  $ admission_source_id.7      : num  1 1 1 1 0 1 1 1 1 0 ...
##  $ max_glu_serum_200          : num  1 0 0 1 0 0 0 0 0 1 ...
##  $ max_glu_serum_300          : num  0 1 1 0 0 1 0 1 1 0 ...
##  $ max_glu_serum_Norm         : num  0 0 0 0 1 0 1 0 0 0 ...
##  $ A1Cresult_7                : num  0 0 0 1 1 1 1 1 1 0 ...
##  $ A1Cresult_8                : num  0 1 1 0 0 0 0 0 0 1 ...
##  $ A1Cresult_Norm             : num  1 0 0 0 0 0 0 0 0 0 ...
##  $ med_change_yes             : num  0 1 0 0 0 0 0 1 0 0 ...
##  $ med_change_no              : num  1 0 1 1 1 1 1 0 1 1 ...
##  $ diabetes_med_yes           : num  1 0 0 0 1 0 0 0 1 0 ...
##  $ diabetes_med_no            : num  0 1 1 1 0 1 1 1 0 1 ...

Bucketing and dummies of diag_1, diag_2 and diag_3

#creating new object for diag1, 2 & 3 variables and initiating categories for them

diag_data <- data

diag_data$Circulatory_diag <- 0
diag_data$Respiraoty_diag <- 0
diag_data$Digestive_diag <- 0
diag_data$Diabetes_diag <- 0
diag_data$Injury_diag <- 0
diag_data$Musktl_diag <-0
diag_data$Genitourinary_diag <- 0
diag_data$Neoplasm_diag <-0
diag_data$Other_diag <- 0

#Circulatory

diag_data$Circulatory_diag[(as.character( diag_data$diag_1) >= "390" & as.character( diag_data$diag_1) <= "459" |  as.character( diag_data$diag_1 ) == "785")
                   | (as.character( diag_data$diag_2) >= "390" & as.character( diag_data$diag_2) <= "459" |  as.character( diag_data$diag_2 ) == "785")
                   | (as.character( diag_data$diag_3) >= "390" & as.character( diag_data$diag_3) <= "459" |  as.character( diag_data$diag_3 ) == "785")] <- 1

#Diabetes

diag_data$Diabetes_diag[(as.character(diag_data$diag_1) > "249" & as.character(diag_data$diag_1) < "251")
                   | (as.character(diag_data$diag_2) > "249" & as.character(diag_data$diag_2) < "251")
                   | (as.character(diag_data$diag_3) > "249" & as.character(diag_data$diag_3) < "251")] <- 1

#Respiratory

diag_data$Respiraoty_diag[(as.character( diag_data$diag_1) >= "460" & as.character( diag_data$diag_1) <= "519" |  as.character( diag_data$diag_1 ) == "786")
                   | (as.character( diag_data$diag_2) >= "460" & as.character( diag_data$diag_2) <= "519" |  as.character( diag_data$diag_2 ) == "786")
                   | (as.character( diag_data$diag_3) >= "460" & as.character( diag_data$diag_3) <= "519" |  as.character( diag_data$diag_3 ) == "786")] <- 1

#Digestive

diag_data$Digestive_diag[(as.character( diag_data$diag_1) >= "520" & as.character( diag_data$diag_1) <= "579" |  as.character( diag_data$diag_1 ) == "787")
                  | (as.character( diag_data$diag_2) >= "520" & as.character( diag_data$diag_2) <= "579" |  as.character( diag_data$diag_2 ) == "787")
                  | (as.character( diag_data$diag_3) >= "520" & as.character( diag_data$diag_3) <= "579" |  as.character( diag_data$diag_3 ) == "787")] <- 1

#Injury

diag_data$Injury_diag[(as.character( diag_data$diag_1) >= "800" & as.character( diag_data$diag_1) <= "999")
                  | (as.character( diag_data$diag_2) >= "800" & as.character( diag_data$diag_2) <= "999")
                  | (as.character( diag_data$diag_3) >= "800" & as.character( diag_data$diag_3) <= "999")] <- 1

#Musculoskeletal

diag_data$Musktl_diag[(as.character( diag_data$diag_1) >= "710" & as.character( diag_data$diag_1) <= "739")
                   | (as.character( diag_data$diag_2) >= "710" & as.character( diag_data$diag_2) <= "739")
                   | (as.character( diag_data$diag_3) >= "710" & as.character( diag_data$diag_3) <= "739")] <- 1

#Genitourinary

diag_data$Genitourinary_diag[(as.character( diag_data$diag_1) >= "580" & as.character( diag_data$diag_1) <= "629" |  as.character( diag_data$diag_1 ) == "788")
                   | (as.character( diag_data$diag_2) >= "580" & as.character( diag_data$diag_2) <= "629" |  as.character( diag_data$diag_2 ) == "788")
                   | (as.character( diag_data$diag_3) >= "580" & as.character( diag_data$diag_3) <= "629" |  as.character( diag_data$diag_3 ) == "788")] <- 1

#Neoplasms

diag_data$Neoplasm_diag[(as.character( diag_data$diag_1) >= "140" & as.character( diag_data$diag_1) <= "239")
                   | (as.character( diag_data$diag_2) >= "140" & as.character( diag_data$diag_2) <= "239")
                   | (as.character( diag_data$diag_3) >= "140" & as.character( diag_data$diag_3) <= "239")] <- 1

#Other diagnoses

diag_data$Other_diag[(as.character( diag_data$diag_1) == "780") | (as.character( diag_data$diag_1) == "781")
                    | (as.character( diag_data$diag_1) == "784") | (as.character( diag_data$diag_1) >= "790" & as.character( diag_data$diag_1) <= "799")
                    | (as.character( diag_data$diag_1) >= "240" & as.character( diag_data$diag_1) <= "249") | (as.character( diag_data$diag_1) >= "251" & as.character( diag_data$diag_1) <= "279")
                    | (as.character( diag_data$diag_1) >= "680" & as.character( diag_data$diag_1) <= "709") | (as.character( diag_data$diag_1) == "782") 
                    | (as.character( diag_data$diag_1) >= "001" & as.character( diag_data$diag_1) <= "139") | (as.character( diag_data$diag_1) >= "290" & as.character( diag_data$diag_1) <= "319")
                    | (as.character( diag_data$diag_1) >= "280" & as.character( diag_data$diag_1) <= "289") | (as.character( diag_data$diag_1) >= "320" & as.character( diag_data$diag_1) <= "359")
                    | (as.character( diag_data$diag_1) >= "630" & as.character( diag_data$diag_1) <= "679") | (as.character( diag_data$diag_1) >= "360" & as.character( diag_data$diag_1) <= "389")
                    | (as.character( diag_data$diag_1) >= "740" & as.character( diag_data$diag_1) <= "759")
                    | (startsWith(as.character( diag_data$diag_1), 'E'))
                    | (startsWith(as.character( diag_data$diag_1), 'V'))
                    | (as.character( diag_data$diag_2) == "780") | (as.character( diag_data$diag_2) == "781")
                    | (as.character( diag_data$diag_2) == "784") | (as.character( diag_data$diag_2) >= "790" & as.character( diag_data$diag_2) <= "799")
                    | (as.character( diag_data$diag_2) >= "240" & as.character( diag_data$diag_2) <= "249") | (as.character( diag_data$diag_2) >= "251" & as.character( diag_data$diag_2) <= "279")
                    | (as.character( diag_data$diag_2) >= "680" & as.character( diag_data$diag_2) <= "709") | (as.character( diag_data$diag_2) == "782") 
                    | (as.character( diag_data$diag_2) >= "001" & as.character( diag_data$diag_2) <= "139") | (as.character( diag_data$diag_2) >= "290" & as.character( diag_data$diag_2) <= "319")
                    | (as.character( diag_data$diag_2) >= "280" & as.character( diag_data$diag_2) <= "289") | (as.character( diag_data$diag_2) >= "320" & as.character( diag_data$diag_2) <= "359")
                    | (as.character( diag_data$diag_2) >= "630" & as.character( diag_data$diag_2) <= "679") | (as.character( diag_data$diag_2) >= "360" & as.character( diag_data$diag_2) <= "389")
                    | (as.character( diag_data$diag_2) >= "740" & as.character( diag_data$diag_2) <= "759")
                    | (startsWith(as.character( diag_data$diag_2), 'E')) 
                    | (startsWith(as.character( diag_data$diag_2), 'V'))
                    | (as.character( diag_data$diag_3) == "780") | (as.character( diag_data$diag_3) == "781")
                    | (as.character( diag_data$diag_3) == "784") | (as.character( diag_data$diag_3) >= "790" & as.character( diag_data$diag_3) <= "799")
                    | (as.character( diag_data$diag_3) >= "240" & as.character( diag_data$diag_3) <= "249") | (as.character( diag_data$diag_3) >= "251" & as.character( diag_data$diag_3) <= "279")
                    | (as.character( diag_data$diag_3) >= "680" & as.character( diag_data$diag_3) <= "709") | (as.character( diag_data$diag_3) == "782")
                    | (as.character( diag_data$diag_3) >= "001" & as.character( diag_data$diag_3) <= "139") | (as.character( diag_data$diag_3) >= "290" & as.character( diag_data$diag_3) <= "319")
                    | (as.character( diag_data$diag_3) >= "280" & as.character( diag_data$diag_3) <= "289") | (as.character( diag_data$diag_3) >= "320" & as.character( diag_data$diag_3) <= "359")
                    | (as.character( diag_data$diag_3) >= "630" & as.character( diag_data$diag_3) <= "679") | (as.character( diag_data$diag_3) >= "360" & as.character( diag_data$diag_3) <= "389")
                    | (as.character( diag_data$diag_3) >= "740" & as.character( diag_data$diag_3) <= "759")
                    | (startsWith(as.character( diag_data$diag_3), 'E')) 
                    | (startsWith(as.character( diag_data$diag_3), 'V'))] <- 1

#merge bucketed variables diag_1,2&3 with dataset

data <- diag_data

#check for bucketing and merging

str(data)
## 'data.frame':    289 obs. of  72 variables:
##  $ race                       : Factor w/ 5 levels "AfricanAmerican",..: 3 1 3 5 3 3 4 3 3 3 ...
##  $ gender                     : Factor w/ 2 levels "Female","Male": 2 1 1 2 1 2 1 1 1 2 ...
##  $ age                        : Factor w/ 8 levels "[10-20)","[20-30)",..: 8 7 5 7 3 8 5 4 5 5 ...
##  $ admission_type_id          : Factor w/ 4 levels "1","2","3","6": 4 4 4 4 4 4 4 4 4 4 ...
##  $ discharge_disposition_id   : Factor w/ 9 levels "1","2","3","5",..: 3 1 1 5 1 1 1 1 1 7 ...
##  $ admission_source_id        : Factor w/ 3 levels "1","2","7": 3 3 3 3 2 3 3 3 3 1 ...
##  $ time_in_hospital           : int  5 10 2 11 14 7 2 3 2 4 ...
##  $ num_lab_procedures         : int  47 72 61 71 43 105 66 76 43 41 ...
##  $ num_procedures             : int  1 1 0 1 0 3 0 0 0 1 ...
##  $ num_medications            : int  6 19 5 20 11 16 3 9 13 8 ...
##  $ number_outpatient          : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ number_emergency           : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ number_inpatient           : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ diag_1                     : Factor w/ 90 levels "112","162","188",..: 24 5 19 85 22 36 6 21 71 74 ...
##  $ diag_2                     : Factor w/ 103 levels "162","174","197",..: 28 23 89 9 7 9 42 23 10 66 ...
##  $ diag_3                     : Factor w/ 103 levels "198","208","211",..: 46 28 9 99 65 21 19 31 92 6 ...
##  $ number_diagnoses           : int  5 5 5 5 3 5 3 5 5 3 ...
##  $ max_glu_serum              : Factor w/ 3 levels ">200",">300",..: 1 2 2 1 3 2 3 2 2 1 ...
##  $ A1Cresult                  : Factor w/ 3 levels ">7",">8","Norm": 3 2 2 1 1 1 1 1 1 2 ...
##  $ change                     : Factor w/ 2 levels "Ch","No": 2 1 2 2 2 2 2 1 2 2 ...
##  $ diabetesMed                : Factor w/ 2 levels "No","Yes": 1 2 2 2 1 2 2 2 1 2 ...
##  $ readmitted                 : Factor w/ 2 levels "No","Yes": 2 2 1 1 2 2 2 2 2 2 ...
##  $ race_AfricanAmerican       : num  0 1 0 0 0 0 0 0 0 0 ...
##  $ race_Asian                 : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ race_Caucasian             : num  1 0 1 0 1 1 0 1 1 1 ...
##  $ race_Hispanic              : num  0 0 0 0 0 0 1 0 0 0 ...
##  $ race_Other                 : num  0 0 0 1 0 0 0 0 0 0 ...
##  $ gender.Female              : num  0 1 1 0 1 0 1 1 1 0 ...
##  $ gender.Male                : num  1 0 0 1 0 1 0 0 0 1 ...
##  $ age_10_20                  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ age_20_30                  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ age_30_40                  : num  0 0 0 0 1 0 0 0 0 0 ...
##  $ age_40_50                  : num  0 0 0 0 0 0 0 1 0 0 ...
##  $ age_50_60                  : num  0 0 1 0 0 0 1 0 1 1 ...
##  $ age_60_70                  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ age_70_80                  : num  0 1 0 1 0 0 0 0 0 0 ...
##  $ age_80_90                  : num  1 0 0 0 0 1 0 0 0 0 ...
##  $ admission_type_id.1        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ admission_type_id.2        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ admission_type_id.3        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ admission_type_id.6        : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ discharge_disposition_id.1 : num  0 1 1 0 1 1 1 1 1 0 ...
##  $ discharge_disposition_id.2 : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ discharge_disposition_id.3 : num  1 0 0 0 0 0 0 0 0 0 ...
##  $ discharge_disposition_id.5 : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ discharge_disposition_id.6 : num  0 0 0 1 0 0 0 0 0 0 ...
##  $ discharge_disposition_id.7 : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ discharge_disposition_id.10: num  0 0 0 0 0 0 0 0 0 1 ...
##  $ discharge_disposition_id.11: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ discharge_disposition_id.13: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ admission_source_id.1      : num  0 0 0 0 0 0 0 0 0 1 ...
##  $ admission_source_id.2      : num  0 0 0 0 1 0 0 0 0 0 ...
##  $ admission_source_id.7      : num  1 1 1 1 0 1 1 1 1 0 ...
##  $ max_glu_serum_200          : num  1 0 0 1 0 0 0 0 0 1 ...
##  $ max_glu_serum_300          : num  0 1 1 0 0 1 0 1 1 0 ...
##  $ max_glu_serum_Norm         : num  0 0 0 0 1 0 1 0 0 0 ...
##  $ A1Cresult_7                : num  0 0 0 1 1 1 1 1 1 0 ...
##  $ A1Cresult_8                : num  0 1 1 0 0 0 0 0 0 1 ...
##  $ A1Cresult_Norm             : num  1 0 0 0 0 0 0 0 0 0 ...
##  $ med_change_yes             : num  0 1 0 0 0 0 0 1 0 0 ...
##  $ med_change_no              : num  1 0 1 1 1 1 1 0 1 1 ...
##  $ diabetes_med_yes           : num  1 0 0 0 1 0 0 0 1 0 ...
##  $ diabetes_med_no            : num  0 1 1 1 0 1 1 1 0 1 ...
##  $ Circulatory_diag           : num  1 0 0 0 0 1 1 0 0 0 ...
##  $ Respiraoty_diag            : num  0 0 0 0 0 0 0 0 0 1 ...
##  $ Digestive_diag             : num  0 0 0 0 1 0 0 0 0 0 ...
##  $ Diabetes_diag              : num  0 1 1 1 1 1 1 0 1 1 ...
##  $ Injury_diag                : num  0 0 0 1 0 0 0 0 1 0 ...
##  $ Musktl_diag                : num  0 0 0 0 0 0 0 0 0 1 ...
##  $ Genitourinary_diag         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Neoplasm_diag              : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Other_diag                 : num  1 1 1 1 1 1 1 1 1 0 ...
cbind(colnames(data))
##       [,1]                         
##  [1,] "race"                       
##  [2,] "gender"                     
##  [3,] "age"                        
##  [4,] "admission_type_id"          
##  [5,] "discharge_disposition_id"   
##  [6,] "admission_source_id"        
##  [7,] "time_in_hospital"           
##  [8,] "num_lab_procedures"         
##  [9,] "num_procedures"             
## [10,] "num_medications"            
## [11,] "number_outpatient"          
## [12,] "number_emergency"           
## [13,] "number_inpatient"           
## [14,] "diag_1"                     
## [15,] "diag_2"                     
## [16,] "diag_3"                     
## [17,] "number_diagnoses"           
## [18,] "max_glu_serum"              
## [19,] "A1Cresult"                  
## [20,] "change"                     
## [21,] "diabetesMed"                
## [22,] "readmitted"                 
## [23,] "race_AfricanAmerican"       
## [24,] "race_Asian"                 
## [25,] "race_Caucasian"             
## [26,] "race_Hispanic"              
## [27,] "race_Other"                 
## [28,] "gender.Female"              
## [29,] "gender.Male"                
## [30,] "age_10_20 "                 
## [31,] "age_20_30"                  
## [32,] "age_30_40"                  
## [33,] "age_40_50"                  
## [34,] "age_50_60"                  
## [35,] "age_60_70"                  
## [36,] "age_70_80"                  
## [37,] "age_80_90"                  
## [38,] "admission_type_id.1"        
## [39,] "admission_type_id.2"        
## [40,] "admission_type_id.3"        
## [41,] "admission_type_id.6"        
## [42,] "discharge_disposition_id.1" 
## [43,] "discharge_disposition_id.2" 
## [44,] "discharge_disposition_id.3" 
## [45,] "discharge_disposition_id.5" 
## [46,] "discharge_disposition_id.6" 
## [47,] "discharge_disposition_id.7" 
## [48,] "discharge_disposition_id.10"
## [49,] "discharge_disposition_id.11"
## [50,] "discharge_disposition_id.13"
## [51,] "admission_source_id.1"      
## [52,] "admission_source_id.2"      
## [53,] "admission_source_id.7"      
## [54,] "max_glu_serum_200"          
## [55,] "max_glu_serum_300"          
## [56,] "max_glu_serum_Norm"         
## [57,] "A1Cresult_7"                
## [58,] "A1Cresult_8"                
## [59,] "A1Cresult_Norm"             
## [60,] "med_change_yes"             
## [61,] "med_change_no"              
## [62,] "diabetes_med_yes"           
## [63,] "diabetes_med_no"            
## [64,] "Circulatory_diag"           
## [65,] "Respiraoty_diag"            
## [66,] "Digestive_diag"             
## [67,] "Diabetes_diag"              
## [68,] "Injury_diag"                
## [69,] "Musktl_diag"                
## [70,] "Genitourinary_diag"         
## [71,] "Neoplasm_diag"              
## [72,] "Other_diag"

#reorganize column positions: place response variable as last column.

data = data[, c(1:21, 23:72, 22)]

Variables for which dummies have been created will be removed from dataset after EDA

Splitting Data

set.seed(123)
split <- sample.split(data$readmitted, SplitRatio = 0.80)
train_set <- subset(data, split == TRUE)
test_set <- subset(data, split == FALSE)

EDA on train data set.

#summary

summary(train_set)
##               race        gender         age     admission_type_id
##  AfricanAmerican: 33   Female:133   [70-80):50   1: 42            
##  Asian          :  5   Male  : 98   [50-60):47   2:  4            
##  Caucasian      :153                [60-70):47   3:  2            
##  Hispanic       : 31                [80-90):35   6:183            
##  Other          :  9                [40-50):32                    
##                                     [30-40):13                    
##                                     (Other): 7                    
##  discharge_disposition_id admission_source_id time_in_hospital
##  1      :149              1: 21               Min.   : 1.000  
##  3      : 27              2:  2               1st Qu.: 3.000  
##  6      : 24              7:208               Median : 5.000  
##  2      : 15                                  Mean   : 5.459  
##  7      :  7                                  3rd Qu.: 7.000  
##  5      :  5                                  Max.   :14.000  
##  (Other):  4                                                  
##  num_lab_procedures num_procedures   num_medications number_outpatient
##  Min.   : 31.00     Min.   :0.0000   Min.   : 1.00   Min.   :0.0000   
##  1st Qu.: 54.00     1st Qu.:0.0000   1st Qu.: 9.00   1st Qu.:0.0000   
##  Median : 63.00     Median :0.0000   Median :14.00   Median :0.0000   
##  Mean   : 64.19     Mean   :0.8528   Mean   :14.68   Mean   :0.1991   
##  3rd Qu.: 74.00     3rd Qu.:1.5000   3rd Qu.:19.00   3rd Qu.:0.0000   
##  Max.   :106.00     Max.   :6.0000   Max.   :35.00   Max.   :6.0000   
##                                                                       
##  number_emergency number_inpatient     diag_1        diag_2        diag_3   
##  Min.   :0.0000   Min.   :0.0000   491    : 17   250    : 19   250    : 18  
##  1st Qu.:0.0000   1st Qu.:0.0000   428    : 16   250.02 : 14   401    : 16  
##  Median :0.0000   Median :0.0000   682    : 13   276    : 12   276    : 12  
##  Mean   :0.1905   Mean   :0.6667   414    : 12   599    : 10   250.02 : 11  
##  3rd Qu.:0.0000   3rd Qu.:1.0000   786    : 10   411    :  9   414    : 11  
##  Max.   :9.0000   Max.   :7.0000   250.02 :  8   707    :  9   428    :  7  
##                                    (Other):155   (Other):158   (Other):156  
##  number_diagnoses max_glu_serum A1Cresult  change   diabetesMed
##  Min.   :3.000    >200:56       >7  : 54   Ch: 64   No : 99    
##  1st Qu.:5.000    >300:96       >8  :135   No:167   Yes:132    
##  Median :6.000    Norm:79       Norm: 42                       
##  Mean   :5.978                                                 
##  3rd Qu.:6.000                                                 
##  Max.   :9.000                                                 
##                                                                
##  race_AfricanAmerican   race_Asian      race_Caucasian   race_Hispanic   
##  Min.   :0.0000       Min.   :0.00000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0000       1st Qu.:0.00000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :0.0000       Median :0.00000   Median :1.0000   Median :0.0000  
##  Mean   :0.1429       Mean   :0.02165   Mean   :0.6623   Mean   :0.1342  
##  3rd Qu.:0.0000       3rd Qu.:0.00000   3rd Qu.:1.0000   3rd Qu.:0.0000  
##  Max.   :1.0000       Max.   :1.00000   Max.   :1.0000   Max.   :1.0000  
##                                                                          
##    race_Other      gender.Female     gender.Male       age_10_20      
##  Min.   :0.00000   Min.   :0.0000   Min.   :0.0000   Min.   :0.00000  
##  1st Qu.:0.00000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.00000  
##  Median :0.00000   Median :1.0000   Median :0.0000   Median :0.00000  
##  Mean   :0.03896   Mean   :0.5758   Mean   :0.4242   Mean   :0.01732  
##  3rd Qu.:0.00000   3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:0.00000  
##  Max.   :1.00000   Max.   :1.0000   Max.   :1.0000   Max.   :1.00000  
##                                                                       
##    age_20_30         age_30_40         age_40_50        age_50_60     
##  Min.   :0.00000   Min.   :0.00000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :0.00000   Median :0.00000   Median :0.0000   Median :0.0000  
##  Mean   :0.01299   Mean   :0.05628   Mean   :0.1385   Mean   :0.2035  
##  3rd Qu.:0.00000   3rd Qu.:0.00000   3rd Qu.:0.0000   3rd Qu.:0.0000  
##  Max.   :1.00000   Max.   :1.00000   Max.   :1.0000   Max.   :1.0000  
##                                                                       
##    age_60_70        age_70_80        age_80_90      admission_type_id.1
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000     
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000     
##  Median :0.0000   Median :0.0000   Median :0.0000   Median :0.0000     
##  Mean   :0.2035   Mean   :0.2165   Mean   :0.1515   Mean   :0.1818     
##  3rd Qu.:0.0000   3rd Qu.:0.0000   3rd Qu.:0.0000   3rd Qu.:0.0000     
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000     
##                                                                        
##  admission_type_id.2 admission_type_id.3 admission_type_id.6
##  Min.   :0.00000     Min.   :0.000000    Min.   :0.0000     
##  1st Qu.:0.00000     1st Qu.:0.000000    1st Qu.:1.0000     
##  Median :0.00000     Median :0.000000    Median :1.0000     
##  Mean   :0.01732     Mean   :0.008658    Mean   :0.7922     
##  3rd Qu.:0.00000     3rd Qu.:0.000000    3rd Qu.:1.0000     
##  Max.   :1.00000     Max.   :1.000000    Max.   :1.0000     
##                                                             
##  discharge_disposition_id.1 discharge_disposition_id.2
##  Min.   :0.000              Min.   :0.00000           
##  1st Qu.:0.000              1st Qu.:0.00000           
##  Median :1.000              Median :0.00000           
##  Mean   :0.645              Mean   :0.06494           
##  3rd Qu.:1.000              3rd Qu.:0.00000           
##  Max.   :1.000              Max.   :1.00000           
##                                                       
##  discharge_disposition_id.3 discharge_disposition_id.5
##  Min.   :0.0000             Min.   :0.00000           
##  1st Qu.:0.0000             1st Qu.:0.00000           
##  Median :0.0000             Median :0.00000           
##  Mean   :0.1169             Mean   :0.02165           
##  3rd Qu.:0.0000             3rd Qu.:0.00000           
##  Max.   :1.0000             Max.   :1.00000           
##                                                       
##  discharge_disposition_id.6 discharge_disposition_id.7
##  Min.   :0.0000             Min.   :0.0000            
##  1st Qu.:0.0000             1st Qu.:0.0000            
##  Median :0.0000             Median :0.0000            
##  Mean   :0.1039             Mean   :0.0303            
##  3rd Qu.:0.0000             3rd Qu.:0.0000            
##  Max.   :1.0000             Max.   :1.0000            
##                                                       
##  discharge_disposition_id.10 discharge_disposition_id.11
##  Min.   :0                   Min.   :0.00000            
##  1st Qu.:0                   1st Qu.:0.00000            
##  Median :0                   Median :0.00000            
##  Mean   :0                   Mean   :0.01299            
##  3rd Qu.:0                   3rd Qu.:0.00000            
##  Max.   :0                   Max.   :1.00000            
##                                                         
##  discharge_disposition_id.13 admission_source_id.1 admission_source_id.2
##  Min.   :0.000000            Min.   :0.00000       Min.   :0.000000     
##  1st Qu.:0.000000            1st Qu.:0.00000       1st Qu.:0.000000     
##  Median :0.000000            Median :0.00000       Median :0.000000     
##  Mean   :0.004329            Mean   :0.09091       Mean   :0.008658     
##  3rd Qu.:0.000000            3rd Qu.:0.00000       3rd Qu.:0.000000     
##  Max.   :1.000000            Max.   :1.00000       Max.   :1.000000     
##                                                                         
##  admission_source_id.7 max_glu_serum_200 max_glu_serum_300 max_glu_serum_Norm
##  Min.   :0.0000        Min.   :0.0000    Min.   :0.0000    Min.   :0.000     
##  1st Qu.:1.0000        1st Qu.:0.0000    1st Qu.:0.0000    1st Qu.:0.000     
##  Median :1.0000        Median :0.0000    Median :0.0000    Median :0.000     
##  Mean   :0.9004        Mean   :0.2424    Mean   :0.4156    Mean   :0.342     
##  3rd Qu.:1.0000        3rd Qu.:0.0000    3rd Qu.:1.0000    3rd Qu.:1.000     
##  Max.   :1.0000        Max.   :1.0000    Max.   :1.0000    Max.   :1.000     
##                                                                              
##   A1Cresult_7      A1Cresult_8     A1Cresult_Norm   med_change_yes  
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :0.0000   Median :1.0000   Median :0.0000   Median :0.0000  
##  Mean   :0.2338   Mean   :0.5844   Mean   :0.1818   Mean   :0.2771  
##  3rd Qu.:0.0000   3rd Qu.:1.0000   3rd Qu.:0.0000   3rd Qu.:1.0000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##                                                                     
##  med_change_no    diabetes_med_yes diabetes_med_no  Circulatory_diag
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :1.0000   Median :0.0000   Median :1.0000   Median :1.0000  
##  Mean   :0.7229   Mean   :0.4286   Mean   :0.5714   Mean   :0.5411  
##  3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:1.0000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##                                                                     
##  Respiraoty_diag  Digestive_diag   Diabetes_diag     Injury_diag     
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.00000  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.00000  
##  Median :0.0000   Median :0.0000   Median :1.0000   Median :0.00000  
##  Mean   :0.3074   Mean   :0.1212   Mean   :0.6277   Mean   :0.07359  
##  3rd Qu.:1.0000   3rd Qu.:0.0000   3rd Qu.:1.0000   3rd Qu.:0.00000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.00000  
##                                                                      
##   Musktl_diag      Genitourinary_diag Neoplasm_diag       Other_diag    
##  Min.   :0.00000   Min.   :0.0000     Min.   :0.00000   Min.   :0.0000  
##  1st Qu.:0.00000   1st Qu.:0.0000     1st Qu.:0.00000   1st Qu.:0.0000  
##  Median :0.00000   Median :0.0000     Median :0.00000   Median :1.0000  
##  Mean   :0.04329   Mean   :0.1602     Mean   :0.03896   Mean   :0.5541  
##  3rd Qu.:0.00000   3rd Qu.:0.0000     3rd Qu.:0.00000   3rd Qu.:1.0000  
##  Max.   :1.00000   Max.   :1.0000     Max.   :1.00000   Max.   :1.0000  
##                                                                         
##  readmitted
##  No : 93   
##  Yes:138   
##            
##            
##            
##            
## 

Distribution of response variable

readmit_table <- table(train_set$readmitted)
readmit_percent <- round(readmit_table/sum(readmit_table) *100)
labels1 <- paste(names(readmit_table), readmit_percent)
labels2 <- paste(labels1, "%", sep = "")
pie(readmit_table, labels = labels2, col = c("#032149", "#eabeb4"), main = "Distribution of Readmission Variable" ) 

Distributions of all variables

for(each_col in 1:ncol(train_set)){
  barplot(table(train_set[[each_col]]), main = names(train_set)[each_col], xlab = "", ylab = "Frequency", col = "#7289da")
}

#Relationship b/w Length of stay and readmission

readmit_time <- ggplot(train_set, aes(readmitted)) + 
           geom_bar(aes(time_in_hospital, fill = factor(readmitted)), position = 'stack')+
            scale_fill_manual(values = c("#032149",  "#eabeb4"))+
            scale_color_manual(values = c("#032149",  "#eabeb4" ))+
            ylab("Readmission")+
            xlab("Time in Hospital(days)")+
            ggtitle("Time Spent in Hospital Vs Readmission")
readmit_time

#Relationship b/w if a patient is on diabetes medication and readmission

readmit_meds <- ggplot(train_set, aes(readmitted)) + 
           geom_bar(aes(diabetesMed, fill = factor(readmitted)), position = "dodge")+
            scale_fill_manual(values = c("#032149",  "#eabeb4"))+
            scale_color_manual(values = c("#032149",  "#eabeb4" ))+
            ylab("Readmission")+
            xlab("Diabetes Medication")+
            ggtitle("Relationship b/w being on a diabetes medication(s) and readmission")
readmit_meds

#Frequency of readmission by age

age_readmit <- ggplot(train_set, aes(readmitted)) + 
            geom_bar(aes(readmitted, fill = age), position = 'dodge')+
            xlab("Readmission")+
            ylab("Frequnecy")+
            ggtitle("Freq. of Readmission by Age")
            
age_readmit

#Frequency of readmission by race

race_readmit <- ggplot(train_set, aes(readmitted)) + 
            geom_bar(aes(readmitted, fill = race), position = 'dodge')+
            xlab("Readmission")+
            ylab("Frequnecy")+
            ggtitle("Freq. of Readmission by Race")
race_readmit

#Frequency of readmit by gender

gender_readmit <- ggplot(train_set, aes(readmitted)) + 
            geom_bar(aes(readmitted, fill = gender), position = 'dodge')+
            scale_fill_manual(values = c("#032149", "#eabeb4"))+
            scale_color_manual(values = c("#032149", "#eabeb4"))+
            xlab("Readmission")+
            ylab("Frequnecy")+
            ggtitle("Freq. of Readmission by Gender")
            
gender_readmit

#Spread of Readmission by select variables

boxplot_variables <- names(train_set %>% select(c(time_in_hospital, num_lab_procedures, num_procedures, num_medications, number_outpatient, number_inpatient,number_diagnoses)))

                                          
for (each_var in boxplot_variables) {
  box_data <- data.frame(x = train_set$readmitted, y = train_set[[each_var]])
  box_plot <-ggplot(box_data, aes(x= x, y = y, color = x, fill = x))+
  geom_boxplot() + 
  geom_quasirandom(alpha = 0.3)+ 
  xlab("Readmitted") +
  ylab(each_var)+
  ggtitle(paste("Spread of", each_var, "by Readmission"))+
  scale_fill_manual(values = c("#032149",  "#eabeb4"))+
  scale_color_manual(values = c("#032149",  "#eabeb4"))
  
  print(box_plot)
}

Removing variables for which dummies have been created from train and test dataset

train_set <- train_set %>% select(-race, -gender, -age, -admission_type_id, -discharge_disposition_id, -admission_source_id, -max_glu_serum, -A1Cresult, -change, -diabetesMed, -diag_1, -diag_2, -diag_3)
test_set <- test_set %>% select(-race, -gender, -age, -admission_type_id, -discharge_disposition_id, -admission_source_id, -max_glu_serum, -A1Cresult, -change, -diabetesMed, -diag_1, -diag_2, -diag_3)
str(train_set)
## 'data.frame':    231 obs. of  59 variables:
##  $ time_in_hospital           : int  5 2 11 14 3 2 13 3 4 4 ...
##  $ num_lab_procedures         : int  47 61 71 43 76 43 58 43 52 64 ...
##  $ num_procedures             : int  1 0 1 0 0 0 3 0 0 0 ...
##  $ num_medications            : int  6 5 20 11 9 13 17 3 8 3 ...
##  $ number_outpatient          : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ number_emergency           : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ number_inpatient           : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ number_diagnoses           : int  5 5 5 3 5 5 5 5 3 4 ...
##  $ race_AfricanAmerican       : num  0 0 0 0 0 0 0 1 0 0 ...
##  $ race_Asian                 : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ race_Caucasian             : num  1 1 0 1 1 1 1 0 0 1 ...
##  $ race_Hispanic              : num  0 0 0 0 0 0 0 0 1 0 ...
##  $ race_Other                 : num  0 0 1 0 0 0 0 0 0 0 ...
##  $ gender.Female              : num  0 1 0 1 1 1 0 1 1 0 ...
##  $ gender.Male                : num  1 0 1 0 0 0 1 0 0 1 ...
##  $ age_10_20                  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ age_20_30                  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ age_30_40                  : num  0 0 0 1 0 0 0 0 0 0 ...
##  $ age_40_50                  : num  0 0 0 0 1 0 0 0 0 0 ...
##  $ age_50_60                  : num  0 1 0 0 0 1 0 0 1 0 ...
##  $ age_60_70                  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ age_70_80                  : num  0 0 1 0 0 0 1 1 0 1 ...
##  $ age_80_90                  : num  1 0 0 0 0 0 0 0 0 0 ...
##  $ admission_type_id.1        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ admission_type_id.2        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ admission_type_id.3        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ admission_type_id.6        : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ discharge_disposition_id.1 : num  0 1 0 1 1 1 0 1 1 0 ...
##  $ discharge_disposition_id.2 : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ discharge_disposition_id.3 : num  1 0 0 0 0 0 1 0 0 1 ...
##  $ discharge_disposition_id.5 : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ discharge_disposition_id.6 : num  0 0 1 0 0 0 0 0 0 0 ...
##  $ discharge_disposition_id.7 : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ discharge_disposition_id.10: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ discharge_disposition_id.11: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ discharge_disposition_id.13: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ admission_source_id.1      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ admission_source_id.2      : num  0 0 0 1 0 0 0 0 0 0 ...
##  $ admission_source_id.7      : num  1 1 1 0 1 1 1 1 1 1 ...
##  $ max_glu_serum_200          : num  1 0 1 0 0 0 1 1 0 0 ...
##  $ max_glu_serum_300          : num  0 1 0 0 1 1 0 0 1 0 ...
##  $ max_glu_serum_Norm         : num  0 0 0 1 0 0 0 0 0 1 ...
##  $ A1Cresult_7                : num  0 0 1 1 1 1 0 1 0 0 ...
##  $ A1Cresult_8                : num  0 1 0 0 0 0 1 0 1 0 ...
##  $ A1Cresult_Norm             : num  1 0 0 0 0 0 0 0 0 1 ...
##  $ med_change_yes             : num  0 0 0 0 1 0 0 0 0 0 ...
##  $ med_change_no              : num  1 1 1 1 0 1 1 1 1 1 ...
##  $ diabetes_med_yes           : num  1 0 0 1 0 1 0 1 0 0 ...
##  $ diabetes_med_no            : num  0 1 1 0 1 0 1 0 1 1 ...
##  $ Circulatory_diag           : num  1 0 0 0 0 0 1 1 1 0 ...
##  $ Respiraoty_diag            : num  0 0 0 0 0 0 0 0 1 0 ...
##  $ Digestive_diag             : num  0 0 0 1 0 0 0 0 0 0 ...
##  $ Diabetes_diag              : num  0 1 1 1 0 1 0 1 1 1 ...
##  $ Injury_diag                : num  0 0 1 0 0 1 0 0 0 0 ...
##  $ Musktl_diag                : num  0 0 0 0 0 0 1 0 0 1 ...
##  $ Genitourinary_diag         : num  0 0 0 0 0 0 1 0 0 0 ...
##  $ Neoplasm_diag              : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Other_diag                 : num  1 1 1 1 1 1 0 0 0 1 ...
##  $ readmitted                 : Factor w/ 2 levels "No","Yes": 2 1 1 2 2 2 2 2 1 2 ...
str(test_set)
## 'data.frame':    58 obs. of  59 variables:
##  $ time_in_hospital           : int  10 7 2 4 6 5 9 5 10 5 ...
##  $ num_lab_procedures         : int  72 105 66 41 69 95 53 45 81 98 ...
##  $ num_procedures             : int  1 3 0 1 0 2 1 0 0 0 ...
##  $ num_medications            : int  19 16 3 8 17 11 15 11 19 15 ...
##  $ number_outpatient          : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ number_emergency           : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ number_inpatient           : int  0 0 0 0 1 0 1 0 0 0 ...
##  $ number_diagnoses           : int  5 5 3 3 5 5 5 5 5 5 ...
##  $ race_AfricanAmerican       : num  1 0 0 0 0 1 0 0 0 1 ...
##  $ race_Asian                 : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ race_Caucasian             : num  0 1 0 1 0 0 1 1 0 0 ...
##  $ race_Hispanic              : num  0 0 1 0 1 0 0 0 0 0 ...
##  $ race_Other                 : num  0 0 0 0 0 0 0 0 1 0 ...
##  $ gender.Female              : num  1 0 1 0 1 1 0 0 0 1 ...
##  $ gender.Male                : num  0 1 0 1 0 0 1 1 1 0 ...
##  $ age_10_20                  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ age_20_30                  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ age_30_40                  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ age_40_50                  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ age_50_60                  : num  0 0 1 1 0 1 0 0 0 1 ...
##  $ age_60_70                  : num  0 0 0 0 0 0 0 0 1 0 ...
##  $ age_70_80                  : num  1 0 0 0 1 0 1 1 0 0 ...
##  $ age_80_90                  : num  0 1 0 0 0 0 0 0 0 0 ...
##  $ admission_type_id.1        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ admission_type_id.2        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ admission_type_id.3        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ admission_type_id.6        : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ discharge_disposition_id.1 : num  1 1 1 0 0 1 1 0 0 1 ...
##  $ discharge_disposition_id.2 : num  0 0 0 0 0 0 0 1 0 0 ...
##  $ discharge_disposition_id.3 : num  0 0 0 0 0 0 0 0 1 0 ...
##  $ discharge_disposition_id.5 : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ discharge_disposition_id.6 : num  0 0 0 0 1 0 0 0 0 0 ...
##  $ discharge_disposition_id.7 : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ discharge_disposition_id.10: num  0 0 0 1 0 0 0 0 0 0 ...
##  $ discharge_disposition_id.11: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ discharge_disposition_id.13: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ admission_source_id.1      : num  0 0 0 1 0 0 0 0 0 0 ...
##  $ admission_source_id.2      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ admission_source_id.7      : num  1 1 1 0 1 1 1 1 1 1 ...
##  $ max_glu_serum_200          : num  0 0 0 1 1 0 0 1 0 0 ...
##  $ max_glu_serum_300          : num  1 1 0 0 0 0 0 0 0 1 ...
##  $ max_glu_serum_Norm         : num  0 0 1 0 0 1 1 0 1 0 ...
##  $ A1Cresult_7                : num  0 1 1 0 0 0 0 0 0 1 ...
##  $ A1Cresult_8                : num  1 0 0 1 0 1 1 0 1 0 ...
##  $ A1Cresult_Norm             : num  0 0 0 0 1 0 0 1 0 0 ...
##  $ med_change_yes             : num  1 0 0 0 0 0 1 0 0 0 ...
##  $ med_change_no              : num  0 1 1 1 1 1 0 1 1 1 ...
##  $ diabetes_med_yes           : num  0 0 0 0 1 0 0 0 0 0 ...
##  $ diabetes_med_no            : num  1 1 1 1 0 1 1 1 1 1 ...
##  $ Circulatory_diag           : num  0 1 1 0 1 1 1 1 1 0 ...
##  $ Respiraoty_diag            : num  0 0 0 1 0 0 0 0 1 0 ...
##  $ Digestive_diag             : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Diabetes_diag              : num  1 1 1 1 0 1 0 0 0 0 ...
##  $ Injury_diag                : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Musktl_diag                : num  0 0 0 1 0 0 0 0 0 0 ...
##  $ Genitourinary_diag         : num  0 0 0 0 0 1 0 0 0 1 ...
##  $ Neoplasm_diag              : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Other_diag                 : num  1 1 1 0 0 0 0 0 0 1 ...
##  $ readmitted                 : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 1 1 ...

Correlation

#Create train correlation data

#only numeric variables excluding dummies
train_cor_data <- train_set[, c("time_in_hospital", "num_lab_procedures", "num_procedures", "num_medications",  "number_diagnoses", "number_outpatient", "number_inpatient")]
str(train_cor_data)
## 'data.frame':    231 obs. of  7 variables:
##  $ time_in_hospital  : int  5 2 11 14 3 2 13 3 4 4 ...
##  $ num_lab_procedures: int  47 61 71 43 76 43 58 43 52 64 ...
##  $ num_procedures    : int  1 0 1 0 0 0 3 0 0 0 ...
##  $ num_medications   : int  6 5 20 11 9 13 17 3 8 3 ...
##  $ number_diagnoses  : int  5 5 5 3 5 5 5 5 3 4 ...
##  $ number_outpatient : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ number_inpatient  : int  0 0 0 0 0 0 0 0 0 0 ...

#correlation matrix

train_cor <- cor(train_cor_data, use = "pairwise.complete.obs", method = "pearson")
train_cor
##                    time_in_hospital num_lab_procedures num_procedures
## time_in_hospital        1.000000000         0.27999268     0.23844011
## num_lab_procedures      0.279992675         1.00000000     0.21962159
## num_procedures          0.238440109         0.21962159     1.00000000
## num_medications         0.425852272         0.32974079     0.34204181
## number_diagnoses        0.129669950         0.17004561     0.20551003
## number_outpatient       0.009321861         0.09255762     0.08655823
## number_inpatient        0.073733675        -0.04523063    -0.02002753
##                    num_medications number_diagnoses number_outpatient
## time_in_hospital         0.4258523        0.1296700       0.009321861
## num_lab_procedures       0.3297408        0.1700456       0.092557624
## num_procedures           0.3420418        0.2055100       0.086558231
## num_medications          1.0000000        0.4868194       0.246360104
## number_diagnoses         0.4868194        1.0000000       0.480487005
## number_outpatient        0.2463601        0.4804870       1.000000000
## number_inpatient         0.2546947        0.2853367       0.272736656
##                    number_inpatient
## time_in_hospital         0.07373367
## num_lab_procedures      -0.04523063
## num_procedures          -0.02002753
## num_medications          0.25469465
## number_diagnoses         0.28533668
## number_outpatient        0.27273666
## number_inpatient         1.00000000

#correlation plot

train_cor_plot <- corrplot(train_cor, method = "number", order = "alphabet", tl.cex = 0.8, tl.srt = 45,  col = c("#092c86","#666666", "#f3bbc1"))

#ggpair plot of select variables

ggpair_data <- cbind(train_cor_data, train_set$readmitted)
ggpairs(ggpair_data, aes(color = train_set$readmitted, alpha = 0.4))+ theme_bw() 
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Handle Zero Variance variables

#Determine zero variance variables - train set

train_zvar <- colnames(train_set)[nearZeroVar(train_set)]
train_zvar
##  [1] "number_emergency"            "race_Asian"                 
##  [3] "race_Other"                  "age_10_20 "                 
##  [5] "age_20_30"                   "admission_type_id.2"        
##  [7] "admission_type_id.3"         "discharge_disposition_id.5" 
##  [9] "discharge_disposition_id.7"  "discharge_disposition_id.10"
## [11] "discharge_disposition_id.11" "discharge_disposition_id.13"
## [13] "admission_source_id.2"       "Musktl_diag"                
## [15] "Neoplasm_diag"

#Remove zero variance variables - train set

train_variance <- train_set %>% select(-all_of(train_zvar))
str(train_variance)
## 'data.frame':    231 obs. of  44 variables:
##  $ time_in_hospital          : int  5 2 11 14 3 2 13 3 4 4 ...
##  $ num_lab_procedures        : int  47 61 71 43 76 43 58 43 52 64 ...
##  $ num_procedures            : int  1 0 1 0 0 0 3 0 0 0 ...
##  $ num_medications           : int  6 5 20 11 9 13 17 3 8 3 ...
##  $ number_outpatient         : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ number_inpatient          : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ number_diagnoses          : int  5 5 5 3 5 5 5 5 3 4 ...
##  $ race_AfricanAmerican      : num  0 0 0 0 0 0 0 1 0 0 ...
##  $ race_Caucasian            : num  1 1 0 1 1 1 1 0 0 1 ...
##  $ race_Hispanic             : num  0 0 0 0 0 0 0 0 1 0 ...
##  $ gender.Female             : num  0 1 0 1 1 1 0 1 1 0 ...
##  $ gender.Male               : num  1 0 1 0 0 0 1 0 0 1 ...
##  $ age_30_40                 : num  0 0 0 1 0 0 0 0 0 0 ...
##  $ age_40_50                 : num  0 0 0 0 1 0 0 0 0 0 ...
##  $ age_50_60                 : num  0 1 0 0 0 1 0 0 1 0 ...
##  $ age_60_70                 : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ age_70_80                 : num  0 0 1 0 0 0 1 1 0 1 ...
##  $ age_80_90                 : num  1 0 0 0 0 0 0 0 0 0 ...
##  $ admission_type_id.1       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ admission_type_id.6       : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ discharge_disposition_id.1: num  0 1 0 1 1 1 0 1 1 0 ...
##  $ discharge_disposition_id.2: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ discharge_disposition_id.3: num  1 0 0 0 0 0 1 0 0 1 ...
##  $ discharge_disposition_id.6: num  0 0 1 0 0 0 0 0 0 0 ...
##  $ admission_source_id.1     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ admission_source_id.7     : num  1 1 1 0 1 1 1 1 1 1 ...
##  $ max_glu_serum_200         : num  1 0 1 0 0 0 1 1 0 0 ...
##  $ max_glu_serum_300         : num  0 1 0 0 1 1 0 0 1 0 ...
##  $ max_glu_serum_Norm        : num  0 0 0 1 0 0 0 0 0 1 ...
##  $ A1Cresult_7               : num  0 0 1 1 1 1 0 1 0 0 ...
##  $ A1Cresult_8               : num  0 1 0 0 0 0 1 0 1 0 ...
##  $ A1Cresult_Norm            : num  1 0 0 0 0 0 0 0 0 1 ...
##  $ med_change_yes            : num  0 0 0 0 1 0 0 0 0 0 ...
##  $ med_change_no             : num  1 1 1 1 0 1 1 1 1 1 ...
##  $ diabetes_med_yes          : num  1 0 0 1 0 1 0 1 0 0 ...
##  $ diabetes_med_no           : num  0 1 1 0 1 0 1 0 1 1 ...
##  $ Circulatory_diag          : num  1 0 0 0 0 0 1 1 1 0 ...
##  $ Respiraoty_diag           : num  0 0 0 0 0 0 0 0 1 0 ...
##  $ Digestive_diag            : num  0 0 0 1 0 0 0 0 0 0 ...
##  $ Diabetes_diag             : num  0 1 1 1 0 1 0 1 1 1 ...
##  $ Injury_diag               : num  0 0 1 0 0 1 0 0 0 0 ...
##  $ Genitourinary_diag        : num  0 0 0 0 0 0 1 0 0 0 ...
##  $ Other_diag                : num  1 1 1 1 1 1 0 0 0 1 ...
##  $ readmitted                : Factor w/ 2 levels "No","Yes": 2 1 1 2 2 2 2 2 1 2 ...

#Remove zero variance variables - test set

test_variance <- test_set %>% select(-all_of(train_zvar))
str(test_variance)
## 'data.frame':    58 obs. of  44 variables:
##  $ time_in_hospital          : int  10 7 2 4 6 5 9 5 10 5 ...
##  $ num_lab_procedures        : int  72 105 66 41 69 95 53 45 81 98 ...
##  $ num_procedures            : int  1 3 0 1 0 2 1 0 0 0 ...
##  $ num_medications           : int  19 16 3 8 17 11 15 11 19 15 ...
##  $ number_outpatient         : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ number_inpatient          : int  0 0 0 0 1 0 1 0 0 0 ...
##  $ number_diagnoses          : int  5 5 3 3 5 5 5 5 5 5 ...
##  $ race_AfricanAmerican      : num  1 0 0 0 0 1 0 0 0 1 ...
##  $ race_Caucasian            : num  0 1 0 1 0 0 1 1 0 0 ...
##  $ race_Hispanic             : num  0 0 1 0 1 0 0 0 0 0 ...
##  $ gender.Female             : num  1 0 1 0 1 1 0 0 0 1 ...
##  $ gender.Male               : num  0 1 0 1 0 0 1 1 1 0 ...
##  $ age_30_40                 : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ age_40_50                 : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ age_50_60                 : num  0 0 1 1 0 1 0 0 0 1 ...
##  $ age_60_70                 : num  0 0 0 0 0 0 0 0 1 0 ...
##  $ age_70_80                 : num  1 0 0 0 1 0 1 1 0 0 ...
##  $ age_80_90                 : num  0 1 0 0 0 0 0 0 0 0 ...
##  $ admission_type_id.1       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ admission_type_id.6       : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ discharge_disposition_id.1: num  1 1 1 0 0 1 1 0 0 1 ...
##  $ discharge_disposition_id.2: num  0 0 0 0 0 0 0 1 0 0 ...
##  $ discharge_disposition_id.3: num  0 0 0 0 0 0 0 0 1 0 ...
##  $ discharge_disposition_id.6: num  0 0 0 0 1 0 0 0 0 0 ...
##  $ admission_source_id.1     : num  0 0 0 1 0 0 0 0 0 0 ...
##  $ admission_source_id.7     : num  1 1 1 0 1 1 1 1 1 1 ...
##  $ max_glu_serum_200         : num  0 0 0 1 1 0 0 1 0 0 ...
##  $ max_glu_serum_300         : num  1 1 0 0 0 0 0 0 0 1 ...
##  $ max_glu_serum_Norm        : num  0 0 1 0 0 1 1 0 1 0 ...
##  $ A1Cresult_7               : num  0 1 1 0 0 0 0 0 0 1 ...
##  $ A1Cresult_8               : num  1 0 0 1 0 1 1 0 1 0 ...
##  $ A1Cresult_Norm            : num  0 0 0 0 1 0 0 1 0 0 ...
##  $ med_change_yes            : num  1 0 0 0 0 0 1 0 0 0 ...
##  $ med_change_no             : num  0 1 1 1 1 1 0 1 1 1 ...
##  $ diabetes_med_yes          : num  0 0 0 0 1 0 0 0 0 0 ...
##  $ diabetes_med_no           : num  1 1 1 1 0 1 1 1 1 1 ...
##  $ Circulatory_diag          : num  0 1 1 0 1 1 1 1 1 0 ...
##  $ Respiraoty_diag           : num  0 0 0 1 0 0 0 0 1 0 ...
##  $ Digestive_diag            : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Diabetes_diag             : num  1 1 1 1 0 1 0 0 0 0 ...
##  $ Injury_diag               : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Genitourinary_diag        : num  0 0 0 0 0 1 0 0 0 1 ...
##  $ Other_diag                : num  1 1 1 0 0 0 0 0 0 1 ...
##  $ readmitted                : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 1 1 ...

Scaling Data

#Scale Train dataset and using Preprocessed train data to scale test_set

prep_train_set <- preProcess(train_variance %>% select(-readmitted), method = c("center", "scale"))
train_set <- predict(prep_train_set, train_variance)
test_set <- predict(prep_train_set, test_variance)

#Check variance of each column

variance <- apply(train_variance, 2, var)
variance
##           time_in_hospital         num_lab_procedures 
##                 9.29286655               203.76085074 
##             num_procedures            num_medications 
##                 1.58693770                56.20282326 
##          number_outpatient           number_inpatient 
##                 0.56017316                 1.59710145 
##           number_diagnoses       race_AfricanAmerican 
##                 2.24735554                 0.12298137 
##             race_Caucasian              race_Hispanic 
##                 0.22461886                 0.11669490 
##              gender.Female                gender.Male 
##                 0.24532279                 0.24532279 
##                  age_30_40                  age_40_50 
##                 0.05334086                 0.11985695 
##                  age_50_60                  age_60_70 
##                 0.16277056                 0.16277056 
##                  age_70_80                  age_80_90 
##                 0.17033691                 0.12911726 
##        admission_type_id.1        admission_type_id.6 
##                 0.14940711                 0.16533032 
## discharge_disposition_id.1 discharge_disposition_id.2 
##                 0.22996424                 0.06098250 
## discharge_disposition_id.3 discharge_disposition_id.6 
##                 0.10367024                 0.09350649 
##      admission_source_id.1      admission_source_id.7 
##                 0.08300395                 0.09004329 
##          max_glu_serum_200          max_glu_serum_300 
##                 0.18445323                 0.24392998 
##         max_glu_serum_Norm                A1Cresult_7 
##                 0.22601167                 0.17989836 
##                A1Cresult_8             A1Cresult_Norm 
##                 0.24392998                 0.14940711 
##             med_change_yes              med_change_no 
##                 0.20116695                 0.20116695 
##           diabetes_med_yes            diabetes_med_no 
##                 0.24596273                 0.24596273 
##           Circulatory_diag            Respiraoty_diag 
##                 0.24938829                 0.21381517 
##             Digestive_diag              Diabetes_diag 
##                 0.10698287                 0.23470732 
##                Injury_diag         Genitourinary_diag 
##                 0.06847356                 0.13510258 
##                 Other_diag                 readmitted 
##                 0.24814606                         NA

Dimensionality Reduction

PCA

pca_num_train <- train_set[, c("time_in_hospital", "num_lab_procedures", "num_procedures", "num_medications",  "number_diagnoses", "number_outpatient", "number_inpatient")]
pca_model <- prcomp(pca_num_train, 
                    center = T, 
                    scale = T)
pca_model
## Standard deviations (1, .., p=7):
## [1] 1.5499985 1.1758734 0.9378776 0.8855213 0.7877915 0.7390788 0.6198477
## 
## Rotation (n x k) = (7 x 7):
##                          PC1        PC2        PC3           PC4         PC5
## time_in_hospital   0.3360624 -0.4173462  0.5243925 -0.0412094105  0.48866905
## num_lab_procedures 0.3124108 -0.3999764 -0.2245250 -0.6699986370 -0.47725077
## num_procedures     0.3208567 -0.3401964 -0.3342444  0.7189787806 -0.30569941
## num_medications    0.5255492 -0.1069897  0.1629984  0.0484577298  0.07532914
## number_diagnoses   0.4705238  0.3050575 -0.2289782  0.0009183564  0.21949363
## number_outpatient  0.3470551  0.4689049 -0.3748550 -0.1550765841  0.25472725
## number_inpatient   0.2616690  0.4751092  0.5859743  0.0779447376 -0.56678411
##                            PC6         PC7
## time_in_hospital   -0.35635406 -0.26535406
## num_lab_procedures -0.06652207 -0.10447579
## num_procedures     -0.22597967 -0.09026985
## num_medications     0.44066294  0.69539652
## number_diagnoses    0.48855922 -0.58842833
## number_outpatient  -0.60132700  0.26198432
## number_inpatient   -0.15182785 -0.10983339

#Obtain eigenvalues

get_eigenvalue(pca_model)
##       eigenvalue variance.percent cumulative.variance.percent
## Dim.1  2.4024952        34.321361                    34.32136
## Dim.2  1.3826783        19.752547                    54.07391
## Dim.3  0.8796144        12.565920                    66.63983
## Dim.4  0.7841480        11.202114                    77.84194
## Dim.5  0.6206155         8.865936                    86.70788
## Dim.6  0.5462374         7.803392                    94.51127
## Dim.7  0.3842112         5.488731                   100.00000

#Plot eigenvalues

fviz_eig(pca_model)

#Create PCA train and test datasets

#pca data to be used for clustering and model performance evaluation. pcaComp = 5 covers ~87% of information from data.  
pca = preProcess(x = train_variance %>% select(-readmitted), method = c("center", "scale", 'pca'), pcaComp = 5)
train_pca = predict(pca, train_set) 
test_pca = predict(pca, test_set)
str(train_pca)
## 'data.frame':    231 obs. of  6 variables:
##  $ readmitted: Factor w/ 2 levels "No","Yes": 2 1 1 2 2 2 2 2 1 2 ...
##  $ PC1       : num  -7.129 -0.761 -2.792 -5.812 -0.197 ...
##  $ PC2       : num  0.187 -1.638 -1.497 3.896 -5.88 ...
##  $ PC3       : num  3.86 -4.15 1.6 -4.26 -2.23 ...
##  $ PC4       : num  1.66 4.11 3.63 10.45 3.41 ...
##  $ PC5       : num  1.14 -3.05 -3.53 -1.4 -3.93 ...
str(test_pca)
## 'data.frame':    58 obs. of  6 variables:
##  $ readmitted: Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 1 1 ...
##  $ PC1       : num  1.2 -3.15 -6.9 -1.7 -5.9 ...
##  $ PC2       : num  -5.99 -2.91 -2.49 -2.98 1.15 ...
##  $ PC3       : num  -5.683 0.626 -3.637 -0.812 -0.578 ...
##  $ PC4       : num  2.326 4.206 1.199 12.217 -0.728 ...
##  $ PC5       : num  -4.608 0.681 -3.918 -1.695 -4.187 ...
head(train_pca)
##      readmitted        PC1        PC2       PC3       PC4       PC5
## 163         Yes -7.1285149  0.1872657  3.861544  1.658994  1.139496
## 594          No -0.7613097 -1.6382055 -4.152343  4.114732 -3.054692
## 697          No -2.7919409 -1.4972945  1.604623  3.633316 -3.527514
## 772         Yes -5.8115645  3.8963328 -4.258017 10.450170 -1.403868
## 1281        Yes -0.1965211 -5.8801111 -2.226753  3.408606 -3.930573
## 1756        Yes -3.7727694  3.1367656 -2.632466  3.150225 -4.129599
head(test_pca)
##      readmitted       PC1       PC2        PC3        PC4        PC5
## 461         Yes  1.198539 -5.994778 -5.6833471  2.3261126 -4.6080128
## 824         Yes -3.149068 -2.913959  0.6261374  4.2059116  0.6811802
## 962         Yes -6.899306 -2.492718 -3.6369469  1.1987444 -3.9182227
## 1984        Yes -1.696995 -2.981183 -0.8120658 12.2168597 -1.6951108
## 2309        Yes -5.897093  1.151353 -0.5780209 -0.7283363 -4.1872207
## 5016        Yes -4.071932 -1.917166 -5.0786976  0.4321735 -2.7598897

Clustering

Hierarchical Clustering

#Compute Euclidean Distance between numerical observations in training dataset

hc_train = hclust(d = dist(train_variance %>% select(-readmitted), method = 'euclidean'), method = 'ward.D')
hc_train
## 
## Call:
## hclust(d = dist(train_variance %>% select(-readmitted), method = "euclidean"),     method = "ward.D")
## 
## Cluster method   : ward.D 
## Distance         : euclidean 
## Number of objects: 231

#plot Dendrogram

plot(hc_train, main = "Dendrogram", xlab = "Observations", ylab = "Euclidean distances")

#categorize into three clusters (based on the dendrogram)

y_hc_train = cutree(hc_train, 3)
y_hc_train
##    163    594    697    772   1281   1756   2059   2080   2484   3277   4104 
##      1      2      3      1      3      1      2      1      2      2      2 
##   4857   4967   5149   5313   5388   5771   6953   8808   9119  10040  11979 
##      3      2      3      2      3      2      2      3      3      2      3 
##  12275  12838  13235  13365  13549  13634  14644  15583  16431  16920  17097 
##      2      1      2      2      3      3      2      2      3      3      3 
##  17710  17836  17839  22004  22438  23303  23465  24103  24810  25950  26277 
##      2      1      1      2      2      3      2      3      2      2      2 
##  26379  26453  26497  26826  26840  27472  27669  28370  28491  28772  28951 
##      1      1      2      2      1      3      2      2      3      3      3 
##  29058  29127  29135  29181  29256  29485  29487  29822  30885  31158  31165 
##      1      1      3      2      3      2      2      3      1      2      1 
##  31275  31319  31767  32134  32700  32829  32948  33020  33439  33946  35185 
##      1      1      3      2      2      2      3      3      1      2      2 
##  35443  35777  36745  36987  37015  37249  37402  37411  37656  37954  37987 
##      2      1      1      3      1      2      2      1      3      2      3 
##  38180  38566  39013  39068  39275  39356  39362  40311  40319  40338  40749 
##      3      2      3      2      2      1      2      1      3      2      2 
##  40816  40971  41134  41729  42248  42257  43185  43450  44040  44171  46386 
##      2      2      2      1      2      2      2      2      2      3      1 
##  48474  49293  50337  51185  52679  52765  53181  53261  53376  53547  53672 
##      2      2      1      2      2      3      3      1      3      2      2 
##  53687  54254  54342  55002  55120  55888  56068  57260  57410  57670  57806 
##      1      3      3      2      2      2      1      2      1      3      1 
##  57811  58769  59667  60148  63412  64163  65813  66253  66258  67102  67316 
##      1      2      3      3      3      1      3      3      1      2      2 
##  67406  67529  67616  67638  67716  67995  68230  68427  68516  68656  68770 
##      3      2      2      3      2      2      1      1      3      1      2 
##  69010  69118  69479  69621  69627  70127  70175  70357  70553  71045  71151 
##      1      3      2      3      2      2      2      2      3      1      2 
##  71309  71802  71940  73441  74273  74284  74831  75153  75404  75661  76113 
##      1      2      3      3      1      2      1      1      2      3      2 
##  76869  76954  77058  77281  77674  77817  79345  79526  79574  80203  80254 
##      1      1      3      3      2      3      1      2      3      3      3 
##  80411  80568  80810  81744  82474  82677  83262  83746  84340  84980  85153 
##      1      1      1      2      2      2      3      3      2      1      3 
##  85158  85205  85250  86386  86641  86942  87705  87731  89829  90542  90750 
##      2      3      2      2      2      3      3      3      2      3      3 
##  90824  91002  91881  91960  92591  93124  93323  93894  93938  94140  94519 
##      3      2      1      3      2      1      3      3      3      3      1 
##  94756  96459  97084  97207  97739  98208  99005 100387 100494 100579 101089 
##      2      2      1      1      1      2      2      1      3      3      2

#plot clusters

clusplot(train_variance %>% select(-readmitted), 
         y_hc_train,
         lines = 0, 
         shade = T,
         color = T,
         labels = 2,
         plotchar = F,
         span = T,
         main = "Clusters of Patients (Hierarchical)")

K-means Clustering

k_mean = kmeans(x = train_variance %>% select(-readmitted), centers = 3)
y_means = k_mean$cluster
clusplot(train_variance %>% select(-readmitted),  
         y_means,
         lines = 0, 
         shade = T,
         color = T,
         labels = 2,
         plotchar = F,
         span = T,
         main = "Clusters of Patients (K-means)")

Optimum number of Clusters using Silhouette,Gap Statistic and Elbow Methods

#Silhouette method and k-means

fviz_nbclust(train_variance %>% select(-readmitted), kmeans, method = "silhouette") + labs(subtitle = "Silhoutte method")

#Silhouette method and hierarchical

fviz_nbclust(train_variance %>% select(-readmitted), hcut, method = "silhouette")+ labs(subtitle = "Silhouette method")

#Gap statistic method and kmeans

fviz_nbclust(train_variance %>% select(-readmitted), kmeans, nstart = 25, method = "gap_stat", nboot = 50)+ labs(subtitle = "Gap statistic method")

#Gap statistic method and hierarchical

fviz_nbclust(train_variance %>% select(-readmitted), hcut , nstart = 25, method = "gap_stat", nboot = 50)+ labs(subtitle = "Gap statistic method")

#Elbow method and hierarchical

fviz_nbclust(train_variance %>% select(-readmitted), hcut, method = "wss")+ labs(subbtitle = "Elbow Method")

#Elbow method and kmeans

fviz_nbclust(train_variance %>% select(-readmitted), kmeans, method = "wss")+ labs(subbtitle = "Elbow Method")

Cluster Plots based on outcome of cluster methods above*

#categorize into two clusters

y_hc_train_plot = cutree(hc_train, 2)
y_hc_train_plot
##    163    594    697    772   1281   1756   2059   2080   2484   3277   4104 
##      1      1      2      1      2      1      1      1      1      1      1 
##   4857   4967   5149   5313   5388   5771   6953   8808   9119  10040  11979 
##      2      1      2      1      2      1      1      2      2      1      2 
##  12275  12838  13235  13365  13549  13634  14644  15583  16431  16920  17097 
##      1      1      1      1      2      2      1      1      2      2      2 
##  17710  17836  17839  22004  22438  23303  23465  24103  24810  25950  26277 
##      1      1      1      1      1      2      1      2      1      1      1 
##  26379  26453  26497  26826  26840  27472  27669  28370  28491  28772  28951 
##      1      1      1      1      1      2      1      1      2      2      2 
##  29058  29127  29135  29181  29256  29485  29487  29822  30885  31158  31165 
##      1      1      2      1      2      1      1      2      1      1      1 
##  31275  31319  31767  32134  32700  32829  32948  33020  33439  33946  35185 
##      1      1      2      1      1      1      2      2      1      1      1 
##  35443  35777  36745  36987  37015  37249  37402  37411  37656  37954  37987 
##      1      1      1      2      1      1      1      1      2      1      2 
##  38180  38566  39013  39068  39275  39356  39362  40311  40319  40338  40749 
##      2      1      2      1      1      1      1      1      2      1      1 
##  40816  40971  41134  41729  42248  42257  43185  43450  44040  44171  46386 
##      1      1      1      1      1      1      1      1      1      2      1 
##  48474  49293  50337  51185  52679  52765  53181  53261  53376  53547  53672 
##      1      1      1      1      1      2      2      1      2      1      1 
##  53687  54254  54342  55002  55120  55888  56068  57260  57410  57670  57806 
##      1      2      2      1      1      1      1      1      1      2      1 
##  57811  58769  59667  60148  63412  64163  65813  66253  66258  67102  67316 
##      1      1      2      2      2      1      2      2      1      1      1 
##  67406  67529  67616  67638  67716  67995  68230  68427  68516  68656  68770 
##      2      1      1      2      1      1      1      1      2      1      1 
##  69010  69118  69479  69621  69627  70127  70175  70357  70553  71045  71151 
##      1      2      1      2      1      1      1      1      2      1      1 
##  71309  71802  71940  73441  74273  74284  74831  75153  75404  75661  76113 
##      1      1      2      2      1      1      1      1      1      2      1 
##  76869  76954  77058  77281  77674  77817  79345  79526  79574  80203  80254 
##      1      1      2      2      1      2      1      1      2      2      2 
##  80411  80568  80810  81744  82474  82677  83262  83746  84340  84980  85153 
##      1      1      1      1      1      1      2      2      1      1      2 
##  85158  85205  85250  86386  86641  86942  87705  87731  89829  90542  90750 
##      1      2      1      1      1      2      2      2      1      2      2 
##  90824  91002  91881  91960  92591  93124  93323  93894  93938  94140  94519 
##      2      1      1      2      1      1      2      2      2      2      1 
##  94756  96459  97084  97207  97739  98208  99005 100387 100494 100579 101089 
##      1      1      1      1      1      1      1      1      2      2      1

#plot clusters

clusplot(train_variance %>% select(-readmitted), 
         y_hc_train_plot,
         lines = 0, 
         shade = T,
         color = T,
         labels = 2,
         plotchar = F,
         span = T,
         main = "Clusters of Patients(Hierarchical)-Based on outcomes of cluster methods")

K-means Clustering

k_mean_plot = kmeans(x = train_variance %>% select(-readmitted), centers = 2)
y_means_plot = k_mean_plot$cluster
clusplot(train_variance %>% select(-readmitted),  
         y_means,
         lines = 0, 
         shade = T,
         color = T,
         labels = 2,
         plotchar = F,
         span = T,
         main = "Clusters of Patients(K-means):Based on outcomes of cluster plot methods")

Hierarchical Clustering using PCA data

#Compute Euclidean Distance between numerical observations in training PCA dataset

hc_train_pca = hclust(d = dist(train_pca %>% select(-readmitted), method = 'euclidean'), method = 'ward.D')
hc_train_pca
## 
## Call:
## hclust(d = dist(train_pca %>% select(-readmitted), method = "euclidean"),     method = "ward.D")
## 
## Cluster method   : ward.D 
## Distance         : euclidean 
## Number of objects: 231

#plot Dendrogram - PCA

plot(hc_train_pca, main = "Dendrogram", xlab = "Observations", ylab = "Euclidean distances")

#categorize into two clusters (based on the dendrogram) - PCA

y_hc_train_pca = cutree(hc_train, 3)
y_hc_train_pca
##    163    594    697    772   1281   1756   2059   2080   2484   3277   4104 
##      1      2      3      1      3      1      2      1      2      2      2 
##   4857   4967   5149   5313   5388   5771   6953   8808   9119  10040  11979 
##      3      2      3      2      3      2      2      3      3      2      3 
##  12275  12838  13235  13365  13549  13634  14644  15583  16431  16920  17097 
##      2      1      2      2      3      3      2      2      3      3      3 
##  17710  17836  17839  22004  22438  23303  23465  24103  24810  25950  26277 
##      2      1      1      2      2      3      2      3      2      2      2 
##  26379  26453  26497  26826  26840  27472  27669  28370  28491  28772  28951 
##      1      1      2      2      1      3      2      2      3      3      3 
##  29058  29127  29135  29181  29256  29485  29487  29822  30885  31158  31165 
##      1      1      3      2      3      2      2      3      1      2      1 
##  31275  31319  31767  32134  32700  32829  32948  33020  33439  33946  35185 
##      1      1      3      2      2      2      3      3      1      2      2 
##  35443  35777  36745  36987  37015  37249  37402  37411  37656  37954  37987 
##      2      1      1      3      1      2      2      1      3      2      3 
##  38180  38566  39013  39068  39275  39356  39362  40311  40319  40338  40749 
##      3      2      3      2      2      1      2      1      3      2      2 
##  40816  40971  41134  41729  42248  42257  43185  43450  44040  44171  46386 
##      2      2      2      1      2      2      2      2      2      3      1 
##  48474  49293  50337  51185  52679  52765  53181  53261  53376  53547  53672 
##      2      2      1      2      2      3      3      1      3      2      2 
##  53687  54254  54342  55002  55120  55888  56068  57260  57410  57670  57806 
##      1      3      3      2      2      2      1      2      1      3      1 
##  57811  58769  59667  60148  63412  64163  65813  66253  66258  67102  67316 
##      1      2      3      3      3      1      3      3      1      2      2 
##  67406  67529  67616  67638  67716  67995  68230  68427  68516  68656  68770 
##      3      2      2      3      2      2      1      1      3      1      2 
##  69010  69118  69479  69621  69627  70127  70175  70357  70553  71045  71151 
##      1      3      2      3      2      2      2      2      3      1      2 
##  71309  71802  71940  73441  74273  74284  74831  75153  75404  75661  76113 
##      1      2      3      3      1      2      1      1      2      3      2 
##  76869  76954  77058  77281  77674  77817  79345  79526  79574  80203  80254 
##      1      1      3      3      2      3      1      2      3      3      3 
##  80411  80568  80810  81744  82474  82677  83262  83746  84340  84980  85153 
##      1      1      1      2      2      2      3      3      2      1      3 
##  85158  85205  85250  86386  86641  86942  87705  87731  89829  90542  90750 
##      2      3      2      2      2      3      3      3      2      3      3 
##  90824  91002  91881  91960  92591  93124  93323  93894  93938  94140  94519 
##      3      2      1      3      2      1      3      3      3      3      1 
##  94756  96459  97084  97207  97739  98208  99005 100387 100494 100579 101089 
##      2      2      1      1      1      2      2      1      3      3      2

#plot clusters - PCA

clusplot(train_pca %>% select(-readmitted), 
         y_hc_train_pca,
         lines = 0, 
         shade = T,
         color = T,
         labels = 2,
         plotchar = F,
         span = T,
         main = "Clusters of Patients (Hierarchical)- PCA data")

K-means Clustering - PCA

k_mean_pca = kmeans(x = train_pca %>% select(-readmitted), centers = 2)
y_means_pca = k_mean_pca$cluster
clusplot(train_pca %>% select(-readmitted),  
         y_means_pca,
         lines = 0, 
         shade = T,
         color = T,
         labels = 2,
         plotchar = F,
         span = T,
         main = "Clusters of Patients (K-means) - PCA data")

Optimum number of Clusters using Silhouette,Gap Statistic and Elbow Methods for PCA data

#Silhouette method and k-means - PCA

fviz_nbclust(train_pca %>% select(-readmitted), kmeans, method = "silhouette") + labs(subtitle = "Silhoutte method - PCA data")

#Silhouette method and hierarchical - PCA

fviz_nbclust(train_pca %>% select(-readmitted), hcut, method = "silhouette")+ labs(subtitle = "Silhouette method - PCA data")

#Gap statistic method and kmeans - PCA

fviz_nbclust(train_pca %>% select(-readmitted), kmeans, nstart = 25, method = "gap_stat", nboot = 50)+ labs(subtitle = "Gap statistic method - PCA data")

#Gap statistic method and hierarchical - PCA

fviz_nbclust(train_pca %>% select(-readmitted), hcut , nstart = 25, method = "gap_stat", nboot = 50)+ labs(subtitle = "Gap statistic method - PCA data")

#Elbow method and hierarchical - PCA

fviz_nbclust(train_pca %>% select(-readmitted), hcut, method = "wss")+ labs(subbtitle = "Elbow Method - PCA data")

#Elbow method and kmeans - PCA

fviz_nbclust(train_pca %>% select(-readmitted), kmeans, method = "wss")+ labs(subbtitle = "Elbow Method - PCA data")

Plot Clusters based on outcome of above methods

#categorize into 4 clusters - PCA

y_hc_train_plot_pca = cutree(hc_train, 4)
y_hc_train_plot_pca
##    163    594    697    772   1281   1756   2059   2080   2484   3277   4104 
##      1      2      3      1      3      1      2      1      2      2      2 
##   4857   4967   5149   5313   5388   5771   6953   8808   9119  10040  11979 
##      3      2      4      2      3      2      2      3      3      2      3 
##  12275  12838  13235  13365  13549  13634  14644  15583  16431  16920  17097 
##      2      1      2      2      3      3      2      2      4      3      3 
##  17710  17836  17839  22004  22438  23303  23465  24103  24810  25950  26277 
##      2      1      1      2      2      3      2      3      2      2      2 
##  26379  26453  26497  26826  26840  27472  27669  28370  28491  28772  28951 
##      1      1      2      2      1      3      2      2      3      3      4 
##  29058  29127  29135  29181  29256  29485  29487  29822  30885  31158  31165 
##      1      1      3      2      3      2      2      3      1      2      1 
##  31275  31319  31767  32134  32700  32829  32948  33020  33439  33946  35185 
##      1      1      3      2      2      2      3      3      1      2      2 
##  35443  35777  36745  36987  37015  37249  37402  37411  37656  37954  37987 
##      2      1      1      3      1      2      2      1      3      2      4 
##  38180  38566  39013  39068  39275  39356  39362  40311  40319  40338  40749 
##      3      2      3      2      2      1      2      1      3      2      2 
##  40816  40971  41134  41729  42248  42257  43185  43450  44040  44171  46386 
##      2      2      2      1      2      2      2      2      2      3      1 
##  48474  49293  50337  51185  52679  52765  53181  53261  53376  53547  53672 
##      2      2      1      2      2      3      3      1      3      2      2 
##  53687  54254  54342  55002  55120  55888  56068  57260  57410  57670  57806 
##      1      3      4      2      2      2      1      2      1      3      1 
##  57811  58769  59667  60148  63412  64163  65813  66253  66258  67102  67316 
##      1      2      3      4      3      1      3      3      1      2      2 
##  67406  67529  67616  67638  67716  67995  68230  68427  68516  68656  68770 
##      3      2      2      3      2      2      1      1      3      1      2 
##  69010  69118  69479  69621  69627  70127  70175  70357  70553  71045  71151 
##      1      3      2      3      2      2      2      2      4      1      2 
##  71309  71802  71940  73441  74273  74284  74831  75153  75404  75661  76113 
##      1      2      3      3      1      2      1      1      2      3      2 
##  76869  76954  77058  77281  77674  77817  79345  79526  79574  80203  80254 
##      1      1      4      3      2      3      1      2      4      4      3 
##  80411  80568  80810  81744  82474  82677  83262  83746  84340  84980  85153 
##      1      1      1      2      2      2      3      3      2      1      3 
##  85158  85205  85250  86386  86641  86942  87705  87731  89829  90542  90750 
##      2      4      2      2      2      4      3      3      2      3      3 
##  90824  91002  91881  91960  92591  93124  93323  93894  93938  94140  94519 
##      3      2      1      4      2      1      3      3      3      4      1 
##  94756  96459  97084  97207  97739  98208  99005 100387 100494 100579 101089 
##      2      2      1      1      1      2      2      1      3      4      2

#plot clusters - PCA

clusplot(train_pca %>% select(-readmitted), 
         y_hc_train_plot_pca,
         lines = 0, 
         shade = T,
         color = T,
         labels = 2,
         plotchar = F,
         span = T,
         main = "Clusters of Patients(Hierarchical):Based on methods' outcome w/ PCA data")

K-means Clustering - PCA

k_mean_plot_pca = kmeans(x = train_pca %>% select(-readmitted), centers = 4)
y_means_plot_pca = k_mean_plot_pca$cluster
clusplot(train_pca %>% select(-readmitted),  
         y_means_plot_pca,
         lines = 0, 
         shade = T,
         color = T,
         labels = 2,
         plotchar = F,
         span = T,
         main = "Clusters of Patients (K-means):Based on methods' outcome w/ PCA data")

Supervised Models

#Initiate model outcomes comparison lists

Model_Names <- c()
Model_AUC_Values <- c()

Logistic Regression

#LR model1: all variables

lr_model1 <- glm(formula = readmitted~., family = binomial, data = train_set)
summary(lr_model1)
## 
## Call:
## glm(formula = readmitted ~ ., family = binomial, data = train_set)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.2578  -0.8508   0.4198   0.7898   1.9126  
## 
## Coefficients: (5 not defined because of singularities)
##                              Estimate Std. Error z value Pr(>|z|)   
## (Intercept)                  0.671189   8.143002   0.082   0.9343   
## time_in_hospital             0.505113   0.229413   2.202   0.0277 * 
## num_lab_procedures          -0.613587   0.203776  -3.011   0.0026 **
## num_procedures              -0.165364   0.194378  -0.851   0.3949   
## num_medications              0.437743   0.254940   1.717   0.0860 . 
## number_outpatient            0.158439   0.248601   0.637   0.5239   
## number_inpatient             0.426799   0.253119   1.686   0.0918 . 
## number_diagnoses            -0.055302   0.286420  -0.193   0.8469   
## race_AfricanAmerican        -0.557919   0.293540  -1.901   0.0573 . 
## race_Caucasian              -0.210426   0.342733  -0.614   0.5392   
## race_Hispanic               -0.052851   0.283040  -0.187   0.8519   
## gender.Female                0.179833   0.185335   0.970   0.3319   
## gender.Male                        NA         NA      NA       NA   
## age_30_40                   -0.609509   0.286917  -2.124   0.0336 * 
## age_40_50                   -0.555756   0.381370  -1.457   0.1450   
## age_50_60                   -0.784620   0.451991  -1.736   0.0826 . 
## age_60_70                   -0.793687   0.461006  -1.722   0.0851 . 
## age_70_80                   -0.822054   0.456394  -1.801   0.0717 . 
## age_80_90                   -0.791266   0.421328  -1.878   0.0604 . 
## admission_type_id.1          0.838138   0.568296   1.475   0.1403   
## admission_type_id.6          0.512948   0.618424   0.829   0.4069   
## discharge_disposition_id.1   0.204641   0.311731   0.656   0.5115   
## discharge_disposition_id.2   0.365556   0.230544   1.586   0.1128   
## discharge_disposition_id.3   0.368761   0.266565   1.383   0.1665   
## discharge_disposition_id.6  -0.125522   0.248839  -0.504   0.6140   
## admission_source_id.1       -4.158026 270.909636  -0.015   0.9878   
## admission_source_id.7       -4.719514 282.163384  -0.017   0.9867   
## max_glu_serum_200            0.042715   0.195501   0.218   0.8270   
## max_glu_serum_300            0.062625   0.258818   0.242   0.8088   
## max_glu_serum_Norm                 NA         NA      NA       NA   
## A1Cresult_7                 -0.016264   0.231401  -0.070   0.9440   
## A1Cresult_8                 -0.001258   0.263973  -0.005   0.9962   
## A1Cresult_Norm                     NA         NA      NA       NA   
## med_change_yes               0.065503   0.229605   0.285   0.7754   
## med_change_no                      NA         NA      NA       NA   
## diabetes_med_yes            -0.040807   0.217926  -0.187   0.8515   
## diabetes_med_no                    NA         NA      NA       NA   
## Circulatory_diag            -0.101579   0.225510  -0.450   0.6524   
## Respiraoty_diag              0.057752   0.206678   0.279   0.7799   
## Digestive_diag               0.017185   0.193284   0.089   0.9292   
## Diabetes_diag               -0.444971   0.203907  -2.182   0.0291 * 
## Injury_diag                  0.027498   0.193633   0.142   0.8871   
## Genitourinary_diag          -0.096998   0.177963  -0.545   0.5857   
## Other_diag                  -0.449687   0.228882  -1.965   0.0494 * 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 311.41  on 230  degrees of freedom
## Residual deviance: 235.20  on 192  degrees of freedom
## AIC: 313.2
## 
## Number of Fisher Scoring iterations: 14

#LR model2 - with all significant variables from lr model1

lr_model2 <- glm(formula = readmitted~ time_in_hospital + num_lab_procedures + num_medications + number_outpatient + number_inpatient + age_30_40+ age_50_60 +age_60_70 + age_70_80+ age_80_90 + Diabetes_diag + Other_diag,  family = binomial, data = train_set)
summary(lr_model2)
## 
## Call:
## glm(formula = readmitted ~ time_in_hospital + num_lab_procedures + 
##     num_medications + number_outpatient + number_inpatient + 
##     age_30_40 + age_50_60 + age_60_70 + age_70_80 + age_80_90 + 
##     Diabetes_diag + Other_diag, family = binomial, data = train_set)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.4497  -1.0196   0.5353   0.8958   1.7010  
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)          0.5189     0.1563   3.320  0.00090 ***
## time_in_hospital     0.4573     0.1852   2.469  0.01355 *  
## num_lab_procedures  -0.5495     0.1713  -3.209  0.00133 ** 
## num_medications      0.3573     0.2030   1.760  0.07841 .  
## number_outpatient    0.1888     0.2039   0.926  0.35456    
## number_inpatient     0.5245     0.2317   2.264  0.02360 *  
## age_30_40           -0.2917     0.1715  -1.701  0.08892 .  
## age_50_60           -0.2529     0.2086  -1.213  0.22526    
## age_60_70           -0.2871     0.2205  -1.302  0.19304    
## age_70_80           -0.2396     0.2109  -1.136  0.25596    
## age_80_90           -0.2614     0.2034  -1.285  0.19882    
## Diabetes_diag       -0.2969     0.1615  -1.838  0.06599 .  
## Other_diag          -0.4166     0.1652  -2.521  0.01169 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 311.41  on 230  degrees of freedom
## Residual deviance: 262.76  on 218  degrees of freedom
## AIC: 288.76
## 
## Number of Fisher Scoring iterations: 5

#LR model3

lr_model3 <- glm(formula = readmitted~  time_in_hospital + num_lab_procedures + number_inpatient + Other_diag, family = binomial, data = train_set)
summary(lr_model3)
## 
## Call:
## glm(formula = readmitted ~ time_in_hospital + num_lab_procedures + 
##     number_inpatient + Other_diag, family = binomial, data = train_set)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.3390  -1.0886   0.5733   0.9820   1.7639  
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)          0.5104     0.1527   3.342 0.000831 ***
## time_in_hospital     0.4978     0.1639   3.037 0.002391 ** 
## num_lab_procedures  -0.3767     0.1533  -2.458 0.013987 *  
## number_inpatient     0.6997     0.2202   3.177 0.001486 ** 
## Other_diag          -0.4160     0.1477  -2.817 0.004852 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 311.41  on 230  degrees of freedom
## Residual deviance: 275.51  on 226  degrees of freedom
## AIC: 285.51
## 
## Number of Fisher Scoring iterations: 4

#LR model 4: only most significant variable (**)

lr_model4 <- glm(formula = readmitted~ time_in_hospital +  number_inpatient + Other_diag, family = binomial, data = train_set)
summary(lr_model4)
## 
## Call:
## glm(formula = readmitted ~ time_in_hospital + number_inpatient + 
##     Other_diag, family = binomial, data = train_set)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.1614  -1.1161   0.6079   1.0182   1.5116  
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)        0.4934     0.1498   3.294 0.000988 ***
## time_in_hospital   0.3734     0.1501   2.488 0.012836 *  
## number_inpatient   0.6944     0.2139   3.247 0.001166 ** 
## Other_diag        -0.3789     0.1446  -2.620 0.008801 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 311.41  on 230  degrees of freedom
## Residual deviance: 281.78  on 227  degrees of freedom
## AIC: 289.78
## 
## Number of Fisher Scoring iterations: 4

#performance comparison

compare_performance(lr_model1, lr_model2, lr_model3, lr_model4)
## # Comparison of Model Performance Indices
## 
## Name      | Model | AIC (weights) | AICc (weights) | BIC (weights) | Tjur's R2 |  RMSE | Sigma | Log_loss | Score_log | Score_spherical |   PCP
## -----------------------------------------------------------------------------------------------------------------------------------------------
## lr_model1 |   glm | 313.2 (<.001) |  329.5 (<.001) | 447.5 (<.001) |     0.293 | 0.414 | 1.107 |    0.509 |      -Inf |           0.004 | 0.660
## lr_model2 |   glm | 288.8 (0.149) |  290.4 (0.079) | 333.5 (<.001) |     0.192 | 0.441 | 1.098 |    0.569 |  -151.099 |           0.004 | 0.611
## lr_model3 |   glm | 285.5 (0.761) |  285.8 (0.819) | 302.7 (0.602) |     0.145 | 0.453 | 1.104 |    0.596 |  -145.517 |           0.004 | 0.589
## lr_model4 |   glm | 289.8 (0.090) |  290.0 (0.101) | 303.5 (0.398) |     0.118 | 0.460 | 1.114 |    0.610 |  -141.505 |           0.005 | 0.576

Logistic regression predictions on test data using LR model3 of train data (model with lowest AIC value)

lr_result = predict(lr_model3, test_set, type = "response")
lr_result
##       461       824       962      1984      2309      5016      6129      6452 
## 0.5754908 0.2579972 0.3007636 0.7266454 0.7538063 0.4294611 0.8840238 0.7379612 
##      7698      9471     11728     15359     15538     17795     18511     20963 
## 0.7113244 0.2317860 0.9085133 0.1734862 0.6725922 0.6765962 0.6281069 0.3213621 
##     23943     25851     29060     29366     30649     31935     32130     34606 
## 0.2377085 0.6452716 0.4060798 0.7415831 0.6642826 0.6535344 0.9284307 0.5916334 
##     35975     36349     36685     39784     39833     40684     44259     44286 
## 0.5690315 0.8791495 0.6486647 0.3281264 0.9131998 0.4768299 0.7098697 0.7586708 
##     46721     51517     57291     57846     58435     60051     65125     66118 
## 0.6509185 0.5451385 0.6674764 0.6764589 0.7699843 0.4025018 0.9542761 0.3095083 
##     66395     68026     68269     68348     70681     71862     72980     74933 
## 0.6192066 0.4253061 0.4333837 0.5862032 0.6788687 0.3624103 0.9924719 0.4775360 
##     76453     76775     76814     77043     78993     79192     82660     84909 
## 0.6506927 0.8744509 0.6827756 0.4423166 0.4793012 0.2737639 0.6213054 0.5606325 
##     88479    101030 
## 0.4240962 0.4537406

#Thresholding to obtain class for confusion matrix

result_class0.5 = ifelse(lr_result >=0.5, 1, 0)
result_class0.5
##    461    824    962   1984   2309   5016   6129   6452   7698   9471  11728 
##      1      0      0      1      1      0      1      1      1      0      1 
##  15359  15538  17795  18511  20963  23943  25851  29060  29366  30649  31935 
##      0      1      1      1      0      0      1      0      1      1      1 
##  32130  34606  35975  36349  36685  39784  39833  40684  44259  44286  46721 
##      1      1      1      1      1      0      1      0      1      1      1 
##  51517  57291  57846  58435  60051  65125  66118  66395  68026  68269  68348 
##      1      1      1      1      0      1      0      1      0      0      1 
##  70681  71862  72980  74933  76453  76775  76814  77043  78993  79192  82660 
##      1      0      1      0      1      1      1      0      0      0      1 
##  84909  88479 101030 
##      1      0      0
table(test_set$readmitted, result_class0.5)
##      result_class0.5
##        0  1
##   No  13 10
##   Yes  8 27

#Accuracy (threshold 0.5)

accuracy0.5 <- (16+37)/(16+37+19+15)
accuracy0.5
## [1] 0.6091954

#Sensitivty(threshold = 0.5)

#sensitivity = TP/(TP + FN)
sensitivity0.5 = 16/(16+19)
sensitivity0.5
## [1] 0.4571429

#Specificity(threshold = 0.5)

#specificity = TN/(TN+FP)
specificity0.5 = 37/(37+15)
specificity0.5
## [1] 0.7115385

#LR ROC

lr_roc <- roc(test_set$readmitted, lr_result, plot = TRUE, print.auc = TRUE)
## Setting levels: control = No, case = Yes
## Setting direction: controls < cases

#LR AUC

lr_auc <- auc(test_set$readmitted, lr_result)
## Setting levels: control = No, case = Yes
## Setting direction: controls < cases
lr_auc
## Area under the curve: 0.7379

#Append comparison lists

Model_Names <- append(Model_Names, "Logistic Regression")
Model_AUC_Values <- append(Model_AUC_Values, lr_auc)

Elastic Net

#GLMNet

glmnet_model = cv.glmnet(as.matrix(train_set %>% select(-readmitted)), train_set$readmitted, family = "binomial", alpha = 0.5)
plot(glmnet_model)

glmnet_model
## 
## Call:  cv.glmnet(x = as.matrix(train_set %>% select(-readmitted)), y = train_set$readmitted,      family = "binomial", alpha = 0.5) 
## 
## Measure: Binomial Deviance 
## 
##      Lambda Index Measure      SE Nonzero
## min 0.04374    19   1.280 0.03130      21
## 1se 0.08389    12   1.309 0.02736      14
glmnet_model$lambda.min
## [1] 0.04374214
glmnet_model$lambda.1se
## [1] 0.0838935

#Retrieve coefficients

coef(glmnet_model, s = glmnet_model$lambda.min)
## 44 x 1 sparse Matrix of class "dgCMatrix"
##                                      s1
## (Intercept)                 0.448369055
## time_in_hospital            0.224994675
## num_lab_procedures         -0.236882822
## num_procedures              .          
## num_medications             0.203465214
## number_outpatient           0.016879812
## number_inpatient            0.316770733
## number_diagnoses            .          
## race_AfricanAmerican       -0.197707893
## race_Caucasian              .          
## race_Hispanic               .          
## gender.Female               0.045637428
## gender.Male                -0.042845674
## age_30_40                  -0.056903353
## age_40_50                   .          
## age_50_60                   .          
## age_60_70                   .          
## age_70_80                   .          
## age_80_90                   .          
## admission_type_id.1         0.174575358
## admission_type_id.6         .          
## discharge_disposition_id.1  .          
## discharge_disposition_id.2  0.088496691
## discharge_disposition_id.3  0.099408984
## discharge_disposition_id.6 -0.097540298
## admission_source_id.1       .          
## admission_source_id.7      -0.119507446
## max_glu_serum_200           .          
## max_glu_serum_300           .          
## max_glu_serum_Norm          .          
## A1Cresult_7                 .          
## A1Cresult_8                 .          
## A1Cresult_Norm              .          
## med_change_yes              0.008450632
## med_change_no              -0.007006306
## diabetes_med_yes           -0.005011454
## diabetes_med_no             0.003269135
## Circulatory_diag            .          
## Respiraoty_diag             0.071180199
## Digestive_diag              .          
## Diabetes_diag              -0.114325117
## Injury_diag                 .          
## Genitourinary_diag          .          
## Other_diag                 -0.207451761
coef(glmnet_model, s = glmnet_model$lambda.1se)
## 44 x 1 sparse Matrix of class "dgCMatrix"
##                                      s1
## (Intercept)                 0.414940997
## time_in_hospital            0.110886976
## num_lab_procedures         -0.096337939
## num_procedures              .          
## num_medications             0.164124330
## number_outpatient           .          
## number_inpatient            0.240549917
## number_diagnoses            .          
## race_AfricanAmerican       -0.101037482
## race_Caucasian              .          
## race_Hispanic               .          
## gender.Female               0.009251185
## gender.Male                -0.008286248
## age_30_40                   .          
## age_40_50                   .          
## age_50_60                   .          
## age_60_70                   .          
## age_70_80                   .          
## age_80_90                   .          
## admission_type_id.1         0.053206738
## admission_type_id.6         .          
## discharge_disposition_id.1  .          
## discharge_disposition_id.2  0.016630006
## discharge_disposition_id.3  0.039877769
## discharge_disposition_id.6  .          
## admission_source_id.1       .          
## admission_source_id.7      -0.015855431
## max_glu_serum_200           .          
## max_glu_serum_300           .          
## max_glu_serum_Norm          .          
## A1Cresult_7                 .          
## A1Cresult_8                 .          
## A1Cresult_Norm              .          
## med_change_yes              .          
## med_change_no               .          
## diabetes_med_yes            .          
## diabetes_med_no             .          
## Circulatory_diag            .          
## Respiraoty_diag             0.039137301
## Digestive_diag              .          
## Diabetes_diag              -0.029490578
## Injury_diag                 .          
## Genitourinary_diag          .          
## Other_diag                 -0.135774004

#Prediction on test dataset

glmnet_pred = predict(glmnet_model, newx=as.matrix(test_set%>% select (-readmitted)), type = "response", s=glmnet_model$lambda.1se)
glmnet_pred
##               s1
## 461    0.5109715
## 824    0.4747913
## 962    0.4332065
## 1984   0.6122516
## 2309   0.6652422
## 5016   0.4512088
## 6129   0.6952248
## 6452   0.6277509
## 7698   0.6852019
## 9471   0.4157722
## 11728  0.6905312
## 15359  0.4366775
## 15538  0.6330190
## 17795  0.6091232
## 18511  0.5444884
## 20963  0.4865441
## 23943  0.4292829
## 25851  0.5053022
## 29060  0.5526081
## 29366  0.6091707
## 30649  0.6204999
## 31935  0.5721938
## 32130  0.7641792
## 34606  0.5414921
## 35975  0.5638125
## 36349  0.7194968
## 36685  0.6613101
## 39784  0.4035714
## 39833  0.7136276
## 40684  0.5514611
## 44259  0.5835540
## 44286  0.6873014
## 46721  0.6030548
## 51517  0.5285979
## 57291  0.6813965
## 57846  0.5419784
## 58435  0.6736669
## 60051  0.4745504
## 65125  0.7618593
## 66118  0.4725333
## 66395  0.6840611
## 68026  0.5640196
## 68269  0.4773355
## 68348  0.4706460
## 70681  0.7503135
## 71862  0.4339513
## 72980  0.8711432
## 74933  0.4133803
## 76453  0.4942775
## 76775  0.6524288
## 76814  0.6449857
## 77043  0.4979473
## 78993  0.6682822
## 79192  0.4783805
## 82660  0.6862439
## 84909  0.6337811
## 88479  0.5547753
## 101030 0.5645004
glmnet_class = ifelse(glmnet_pred > 0.5, 1, 0)
table(glmnet_class)
## glmnet_class
##  0  1 
## 17 41
glmnet_roc <- roc(test_set$readmitted, as.numeric(glmnet_pred), auc = T, plot = T, print.auc = TRUE)
## Setting levels: control = No, case = Yes
## Setting direction: controls < cases

#glmtnet AUC

glmnet_auc <- auc(glmnet_roc)
glmnet_auc
## Area under the curve: 0.7093

#Append comparison lists

Model_Names <- append(Model_Names, "GLMNet")
Model_AUC_Values <- append(Model_AUC_Values, glmnet_auc)

Decision Tree

#Model

d_tree = rpart(formula = readmitted ~., method = 'class', data = train_set)

#Prediction on test data

y_pred_tree = predict(d_tree, newdata = test_set)
y_pred_tree
##               No       Yes
## 461    0.8571429 0.1428571
## 824    0.6363636 0.3636364
## 962    0.7878788 0.2121212
## 1984   0.1428571 0.8571429
## 2309   0.1612903 0.8387097
## 5016   0.8571429 0.1428571
## 6129   0.1612903 0.8387097
## 6452   0.2745098 0.7254902
## 7698   0.2800000 0.7200000
## 9471   0.8571429 0.1428571
## 11728  0.3846154 0.6153846
## 15359  0.7878788 0.2121212
## 15538  0.2800000 0.7200000
## 17795  0.1612903 0.8387097
## 18511  0.1428571 0.8571429
## 20963  0.7878788 0.2121212
## 23943  0.7878788 0.2121212
## 25851  0.1428571 0.8571429
## 29060  1.0000000 0.0000000
## 29366  0.1612903 0.8387097
## 30649  0.2745098 0.7254902
## 31935  0.2745098 0.7254902
## 32130  0.1612903 0.8387097
## 34606  0.2745098 0.7254902
## 35975  0.2800000 0.7200000
## 36349  0.1612903 0.8387097
## 36685  0.2745098 0.7254902
## 39784  0.3846154 0.6153846
## 39833  0.1612903 0.8387097
## 40684  0.2745098 0.7254902
## 44259  0.2745098 0.7254902
## 44286  0.1612903 0.8387097
## 46721  0.2800000 0.7200000
## 51517  0.2745098 0.7254902
## 57291  0.1612903 0.8387097
## 57846  0.1612903 0.8387097
## 58435  0.1612903 0.8387097
## 60051  0.3846154 0.6153846
## 65125  0.1612903 0.8387097
## 66118  0.8571429 0.1428571
## 66395  0.2800000 0.7200000
## 68026  0.2745098 0.7254902
## 68269  0.3846154 0.6153846
## 68348  0.3846154 0.6153846
## 70681  0.1612903 0.8387097
## 71862  0.8571429 0.1428571
## 72980  0.1612903 0.8387097
## 74933  0.7878788 0.2121212
## 76453  0.1428571 0.8571429
## 76775  0.2745098 0.7254902
## 76814  0.2745098 0.7254902
## 77043  0.8750000 0.1250000
## 78993  0.2800000 0.7200000
## 79192  0.4285714 0.5714286
## 82660  0.1612903 0.8387097
## 84909  0.2800000 0.7200000
## 88479  0.2745098 0.7254902
## 101030 0.2800000 0.7200000

#Confusion matrix(treshold = 0.5)

y_pred_class0.5 = ifelse(y_pred_tree[, 'Yes'] >= 0.5, "Yes", "No" )
y_pred_class0.5
##    461    824    962   1984   2309   5016   6129   6452   7698   9471  11728 
##   "No"   "No"   "No"  "Yes"  "Yes"   "No"  "Yes"  "Yes"  "Yes"   "No"  "Yes" 
##  15359  15538  17795  18511  20963  23943  25851  29060  29366  30649  31935 
##   "No"  "Yes"  "Yes"  "Yes"   "No"   "No"  "Yes"   "No"  "Yes"  "Yes"  "Yes" 
##  32130  34606  35975  36349  36685  39784  39833  40684  44259  44286  46721 
##  "Yes"  "Yes"  "Yes"  "Yes"  "Yes"  "Yes"  "Yes"  "Yes"  "Yes"  "Yes"  "Yes" 
##  51517  57291  57846  58435  60051  65125  66118  66395  68026  68269  68348 
##  "Yes"  "Yes"  "Yes"  "Yes"  "Yes"  "Yes"   "No"  "Yes"  "Yes"  "Yes"  "Yes" 
##  70681  71862  72980  74933  76453  76775  76814  77043  78993  79192  82660 
##  "Yes"   "No"  "Yes"   "No"  "Yes"  "Yes"  "Yes"   "No"  "Yes"  "Yes"  "Yes" 
##  84909  88479 101030 
##  "Yes"  "Yes"  "Yes"
cm_0.5 <- table(test_set$readmitted, y_pred_class0.5)
cm_0.5
##      y_pred_class0.5
##       No Yes
##   No   7  16
##   Yes  6  29
y_pred_class_fac0.5 = factor(y_pred_class0.5)
y_pred_class_fac0.5
##    461    824    962   1984   2309   5016   6129   6452   7698   9471  11728 
##     No     No     No    Yes    Yes     No    Yes    Yes    Yes     No    Yes 
##  15359  15538  17795  18511  20963  23943  25851  29060  29366  30649  31935 
##     No    Yes    Yes    Yes     No     No    Yes     No    Yes    Yes    Yes 
##  32130  34606  35975  36349  36685  39784  39833  40684  44259  44286  46721 
##    Yes    Yes    Yes    Yes    Yes    Yes    Yes    Yes    Yes    Yes    Yes 
##  51517  57291  57846  58435  60051  65125  66118  66395  68026  68269  68348 
##    Yes    Yes    Yes    Yes    Yes    Yes     No    Yes    Yes    Yes    Yes 
##  70681  71862  72980  74933  76453  76775  76814  77043  78993  79192  82660 
##    Yes     No    Yes     No    Yes    Yes    Yes     No    Yes    Yes    Yes 
##  84909  88479 101030 
##    Yes    Yes    Yes 
## Levels: No Yes
confusionMatrix(test_set$readmitted, y_pred_class_fac0.5)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction No Yes
##        No   7  16
##        Yes  6  29
##                                           
##                Accuracy : 0.6207          
##                  95% CI : (0.4837, 0.7449)
##     No Information Rate : 0.7759          
##     P-Value [Acc > NIR] : 0.99764         
##                                           
##                   Kappa : 0.1436          
##                                           
##  Mcnemar's Test P-Value : 0.05501         
##                                           
##             Sensitivity : 0.5385          
##             Specificity : 0.6444          
##          Pos Pred Value : 0.3043          
##          Neg Pred Value : 0.8286          
##              Prevalence : 0.2241          
##          Detection Rate : 0.1207          
##    Detection Prevalence : 0.3966          
##       Balanced Accuracy : 0.5915          
##                                           
##        'Positive' Class : No              
## 

#Confusion matrix (threshold = 0.2)

y_pred_class0.2 = ifelse(y_pred_tree[, "Yes"]>=0.2, "Yes", "No")
y_pred_class0.2
##    461    824    962   1984   2309   5016   6129   6452   7698   9471  11728 
##   "No"  "Yes"  "Yes"  "Yes"  "Yes"   "No"  "Yes"  "Yes"  "Yes"   "No"  "Yes" 
##  15359  15538  17795  18511  20963  23943  25851  29060  29366  30649  31935 
##  "Yes"  "Yes"  "Yes"  "Yes"  "Yes"  "Yes"  "Yes"   "No"  "Yes"  "Yes"  "Yes" 
##  32130  34606  35975  36349  36685  39784  39833  40684  44259  44286  46721 
##  "Yes"  "Yes"  "Yes"  "Yes"  "Yes"  "Yes"  "Yes"  "Yes"  "Yes"  "Yes"  "Yes" 
##  51517  57291  57846  58435  60051  65125  66118  66395  68026  68269  68348 
##  "Yes"  "Yes"  "Yes"  "Yes"  "Yes"  "Yes"   "No"  "Yes"  "Yes"  "Yes"  "Yes" 
##  70681  71862  72980  74933  76453  76775  76814  77043  78993  79192  82660 
##  "Yes"   "No"  "Yes"  "Yes"  "Yes"  "Yes"  "Yes"   "No"  "Yes"  "Yes"  "Yes" 
##  84909  88479 101030 
##  "Yes"  "Yes"  "Yes"
cm_0.2 <- table(test_set$readmitted, y_pred_class0.2)
cm_0.2
##      y_pred_class0.2
##       No Yes
##   No   5  18
##   Yes  2  33
y_pred_class_fac0.2 = factor(y_pred_class0.2)
table(y_pred_class_fac0.2)
## y_pred_class_fac0.2
##  No Yes 
##   7  51
confusionMatrix(test_set$readmitted, y_pred_class_fac0.2)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction No Yes
##        No   5  18
##        Yes  2  33
##                                           
##                Accuracy : 0.6552          
##                  95% CI : (0.5188, 0.7751)
##     No Information Rate : 0.8793          
##     P-Value [Acc > NIR] : 0.9999981       
##                                           
##                   Kappa : 0.1819          
##                                           
##  Mcnemar's Test P-Value : 0.0007962       
##                                           
##             Sensitivity : 0.71429         
##             Specificity : 0.64706         
##          Pos Pred Value : 0.21739         
##          Neg Pred Value : 0.94286         
##              Prevalence : 0.12069         
##          Detection Rate : 0.08621         
##    Detection Prevalence : 0.39655         
##       Balanced Accuracy : 0.68067         
##                                           
##        'Positive' Class : No              
## 

#DT ROC

dt_roc = roc(test_set$readmitted, y_pred_tree[, 'Yes'])
## Setting levels: control = No, case = Yes
## Setting direction: controls < cases
dt_roc_plot <- plot(dt_roc, print.auc = TRUE)

#DT AUC

dt_auc <- auc(dt_roc_plot)
dt_auc
## Area under the curve: 0.6261

#Append comparison lists

Model_Names <- append(Model_Names, "Decision Tree")
Model_AUC_Values <- append(Model_AUC_Values, dt_auc)

#Plotting

plot(d_tree, uniform = TRUE, main = "Readmission Prediction")
text(d_tree, use.n = TRUE, all = TRUE)

prp(d_tree, type = 1, extra = 3, under = TRUE, varlen = 0, tweak = 1, main = "Readmission Prediction")

Random Forest

#RF model

rf_model <- randomForest(formula = readmitted ~., method = 'class', data = train_set)
rf_model 
## 
## Call:
##  randomForest(formula = readmitted ~ ., data = train_set, method = "class") 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 6
## 
##         OOB estimate of  error rate: 39.83%
## Confusion matrix:
##     No Yes class.error
## No  32  61   0.6559140
## Yes 31 107   0.2246377

#Class for confusion matrix

y_pred_rf = predict(rf_model, newdata = test_set %>% select(-readmitted))

#Probability for ROC And AUC

y_prob_rf = predict(rf_model, newdata = test_set, type = 'prob')
y_prob_rf
##           No   Yes
## 461    0.322 0.678
## 824    0.548 0.452
## 962    0.688 0.312
## 1984   0.436 0.564
## 2309   0.348 0.652
## 5016   0.674 0.326
## 6129   0.268 0.732
## 6452   0.320 0.680
## 7698   0.252 0.748
## 9471   0.656 0.344
## 11728  0.460 0.540
## 15359  0.550 0.450
## 15538  0.258 0.742
## 17795  0.414 0.586
## 18511  0.544 0.456
## 20963  0.634 0.366
## 23943  0.692 0.308
## 25851  0.630 0.370
## 29060  0.538 0.462
## 29366  0.398 0.602
## 30649  0.452 0.548
## 31935  0.466 0.534
## 32130  0.216 0.784
## 34606  0.514 0.486
## 35975  0.356 0.644
## 36349  0.166 0.834
## 36685  0.230 0.770
## 39784  0.630 0.370
## 39833  0.202 0.798
## 40684  0.490 0.510
## 44259  0.476 0.524
## 44286  0.210 0.790
## 46721  0.276 0.724
## 51517  0.604 0.396
## 57291  0.462 0.538
## 57846  0.396 0.604
## 58435  0.326 0.674
## 60051  0.678 0.322
## 65125  0.162 0.838
## 66118  0.552 0.448
## 66395  0.362 0.638
## 68026  0.366 0.634
## 68269  0.782 0.218
## 68348  0.612 0.388
## 70681  0.200 0.800
## 71862  0.550 0.450
## 72980  0.178 0.822
## 74933  0.870 0.130
## 76453  0.566 0.434
## 76775  0.326 0.674
## 76814  0.270 0.730
## 77043  0.532 0.468
## 78993  0.154 0.846
## 79192  0.462 0.538
## 82660  0.382 0.618
## 84909  0.324 0.676
## 88479  0.450 0.550
## 101030 0.340 0.660
## attr(,"class")
## [1] "matrix" "array"  "votes"

#RF ROC

rf_roc <- roc(test_set$readmitted, y_prob_rf[, "Yes"], plot = TRUE, print.auc = TRUE)
## Setting levels: control = No, case = Yes
## Setting direction: controls < cases

#RF AUC

rf_auc <- auc(rf_roc)
rf_auc
## Area under the curve: 0.6938

#Feature Importance

importance(rf_model)
##                            MeanDecreaseGini
## time_in_hospital                   8.342252
## num_lab_procedures                11.867214
## num_procedures                     3.712192
## num_medications                   13.245162
## number_outpatient                  1.414403
## number_inpatient                   5.973916
## number_diagnoses                   3.779270
## race_AfricanAmerican               2.244809
## race_Caucasian                     2.153812
## race_Hispanic                      1.109640
## gender.Female                      2.355329
## gender.Male                        2.454000
## age_30_40                          1.232779
## age_40_50                          1.105829
## age_50_60                          1.650378
## age_60_70                          1.575285
## age_70_80                          1.744514
## age_80_90                          1.446605
## admission_type_id.1                1.115223
## admission_type_id.6                1.072231
## discharge_disposition_id.1         2.139967
## discharge_disposition_id.2         1.134374
## discharge_disposition_id.3         1.690191
## discharge_disposition_id.6         1.585194
## admission_source_id.1              1.063924
## admission_source_id.7              1.388172
## max_glu_serum_200                  1.871660
## max_glu_serum_300                  1.640048
## max_glu_serum_Norm                 1.669858
## A1Cresult_7                        1.472351
## A1Cresult_8                        1.940613
## A1Cresult_Norm                     1.551391
## med_change_yes                     1.404087
## med_change_no                      1.550146
## diabetes_med_yes                   1.854193
## diabetes_med_no                    1.824120
## Circulatory_diag                   1.954859
## Respiraoty_diag                    2.425489
## Digestive_diag                     1.064355
## Diabetes_diag                      2.316744
## Injury_diag                        1.020229
## Genitourinary_diag                 1.376744
## Other_diag                         3.002588

SVM

#SVM model(kernel = linear)

classifier_linear = svm(formula = readmitted~., data = train_variance, kernel = "linear")
classifier_linear
## 
## Call:
## svm(formula = readmitted ~ ., data = train_variance, kernel = "linear")
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  linear 
##        cost:  1 
## 
## Number of Support Vectors:  156

#Predictions on test data

y_pred_svml = predict(classifier_linear, newdata = test_variance)

#Confusion matrix

cm_svml = table(test_variance$readmitted, y_pred_svml)
cm_svml
##      y_pred_svml
##       No Yes
##   No  15   8
##   Yes 13  22
confusionMatrix(test_variance$readmitted, y_pred_svml)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction No Yes
##        No  15   8
##        Yes 13  22
##                                           
##                Accuracy : 0.6379          
##                  95% CI : (0.5012, 0.7601)
##     No Information Rate : 0.5172          
##     P-Value [Acc > NIR] : 0.0431          
##                                           
##                   Kappa : 0.2707          
##                                           
##  Mcnemar's Test P-Value : 0.3827          
##                                           
##             Sensitivity : 0.5357          
##             Specificity : 0.7333          
##          Pos Pred Value : 0.6522          
##          Neg Pred Value : 0.6286          
##              Prevalence : 0.4828          
##          Detection Rate : 0.2586          
##    Detection Prevalence : 0.3966          
##       Balanced Accuracy : 0.6345          
##                                           
##        'Positive' Class : No              
## 

#SVM model (kernel = radial)

classifier_radial = svm(formula = readmitted~., data = train_variance, kernel = "radial")
classifier_radial
## 
## Call:
## svm(formula = readmitted ~ ., data = train_variance, kernel = "radial")
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  radial 
##        cost:  1 
## 
## Number of Support Vectors:  210

#Prediction on test data

y_pred_svmr = predict(classifier_radial, newdata = test_variance)
y_pred_svmr
##    461    824    962   1984   2309   5016   6129   6452   7698   9471  11728 
##    Yes     No     No    Yes    Yes     No    Yes    Yes    Yes     No    Yes 
##  15359  15538  17795  18511  20963  23943  25851  29060  29366  30649  31935 
##     No    Yes     No    Yes     No     No     No     No    Yes    Yes    Yes 
##  32130  34606  35975  36349  36685  39784  39833  40684  44259  44286  46721 
##    Yes     No    Yes    Yes    Yes     No    Yes     No     No    Yes    Yes 
##  51517  57291  57846  58435  60051  65125  66118  66395  68026  68269  68348 
##     No    Yes    Yes    Yes     No    Yes     No    Yes    Yes     No     No 
##  70681  71862  72980  74933  76453  76775  76814  77043  78993  79192  82660 
##    Yes    Yes    Yes     No     No    Yes    Yes     No    Yes    Yes    Yes 
##  84909  88479 101030 
##    Yes    Yes    Yes 
## Levels: No Yes

#Confusion matrix

cm_svmr = table(test_variance$readmitted, y_pred_svmr)
cm_svmr
##      y_pred_svmr
##       No Yes
##   No  13  10
##   Yes  9  26
confusionMatrix(test_variance$readmitted, y_pred_svmr)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction No Yes
##        No  13  10
##        Yes  9  26
##                                           
##                Accuracy : 0.6724          
##                  95% CI : (0.5366, 0.7899)
##     No Information Rate : 0.6207          
##     P-Value [Acc > NIR] : 0.2514          
##                                           
##                   Kappa : 0.3104          
##                                           
##  Mcnemar's Test P-Value : 1.0000          
##                                           
##             Sensitivity : 0.5909          
##             Specificity : 0.7222          
##          Pos Pred Value : 0.5652          
##          Neg Pred Value : 0.7429          
##              Prevalence : 0.3793          
##          Detection Rate : 0.2241          
##    Detection Prevalence : 0.3966          
##       Balanced Accuracy : 0.6566          
##                                           
##        'Positive' Class : No              
## 

#SVM tune

svm_tune <- tune(svm, train.x = train_variance %>% select(-readmitted), 
                 train.y = train_variance$readmitted, 
                 kernel = "radial", ranges = list(cost = 10^(-1:2), gamma = c(0.25, 0.5, 1,2)))
svm_tune
## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  cost gamma
##     1  0.25
## 
## - best performance: 0.3934783

#SVM model using best cost and gamma parameters obtained after tuning.

classifier_cg = svm(formula = as.factor(train_variance$readmitted)~., data = train_variance, type = "C-classification", kernel = "radial", cost = 10, gamma = 0.25)

#Predictions on test set

y_pred_cg = predict(classifier_cg, newdata = test_variance)
cm_cg = table(test_variance$readmitted, y_pred_cg)
cm_cg
##      y_pred_cg
##       No Yes
##   No   0  23
##   Yes  1  34
confusionMatrix(test_variance$readmitted, y_pred_cg)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction No Yes
##        No   0  23
##        Yes  1  34
##                                          
##                Accuracy : 0.5862         
##                  95% CI : (0.4493, 0.714)
##     No Information Rate : 0.9828         
##     P-Value [Acc > NIR] : 1              
##                                          
##                   Kappa : -0.0342        
##                                          
##  Mcnemar's Test P-Value : 1.814e-05      
##                                          
##             Sensitivity : 0.00000        
##             Specificity : 0.59649        
##          Pos Pred Value : 0.00000        
##          Neg Pred Value : 0.97143        
##              Prevalence : 0.01724        
##          Detection Rate : 0.00000        
##    Detection Prevalence : 0.39655        
##       Balanced Accuracy : 0.29825        
##                                          
##        'Positive' Class : No             
## 
summary(classifier_cg)
## 
## Call:
## svm(formula = as.factor(train_variance$readmitted) ~ ., data = train_variance, 
##     type = "C-classification", kernel = "radial", cost = 10, gamma = 0.25)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  radial 
##        cost:  10 
## 
## Number of Support Vectors:  231
## 
##  ( 138 93 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  No Yes

#SVM ROC

svm_roc = roc(test_variance$readmitted, factor(y_pred_cg, ordered = T), plot = T, print.auc = TRUE)
## Setting levels: control = No, case = Yes
## Setting direction: controls < cases

#SVM AUC

svm_auc <- auc(svm_roc)
svm_auc
## Area under the curve: 0.4857

#Append comparison lists

Model_Names <- append(Model_Names, "SVM")
Model_AUC_Values <- append(Model_AUC_Values, svm_auc)

K-nn

#Determine best k value

vec = c()
k_vec = c()
for(k in 1:80){
  y_pred_knn = knn(train = train_set %>% select(-readmitted), 
                 test = test_set %>% select(-readmitted),
                 cl = train_set$readmitted,
               k=k)
  error = mean(y_pred_knn != test_set$readmitted)
  k_vec = c(k_vec, k)
  vec = c(vec, error)
}
cbind(vec)
##             vec
##  [1,] 0.4827586
##  [2,] 0.4482759
##  [3,] 0.4482759
##  [4,] 0.3793103
##  [5,] 0.4310345
##  [6,] 0.4482759
##  [7,] 0.4310345
##  [8,] 0.3793103
##  [9,] 0.3448276
## [10,] 0.3965517
## [11,] 0.3275862
## [12,] 0.2931034
## [13,] 0.2931034
## [14,] 0.3103448
## [15,] 0.2586207
## [16,] 0.2758621
## [17,] 0.2931034
## [18,] 0.3103448
## [19,] 0.3103448
## [20,] 0.3103448
## [21,] 0.3448276
## [22,] 0.3275862
## [23,] 0.3275862
## [24,] 0.2931034
## [25,] 0.3448276
## [26,] 0.3448276
## [27,] 0.3448276
## [28,] 0.2931034
## [29,] 0.3275862
## [30,] 0.3103448
## [31,] 0.3103448
## [32,] 0.2931034
## [33,] 0.2758621
## [34,] 0.2758621
## [35,] 0.2758621
## [36,] 0.2931034
## [37,] 0.2931034
## [38,] 0.3275862
## [39,] 0.3275862
## [40,] 0.3275862
## [41,] 0.3103448
## [42,] 0.3620690
## [43,] 0.3275862
## [44,] 0.3103448
## [45,] 0.3103448
## [46,] 0.3103448
## [47,] 0.3275862
## [48,] 0.3103448
## [49,] 0.3275862
## [50,] 0.3103448
## [51,] 0.3620690
## [52,] 0.3103448
## [53,] 0.3103448
## [54,] 0.3448276
## [55,] 0.3448276
## [56,] 0.3275862
## [57,] 0.3620690
## [58,] 0.3275862
## [59,] 0.3620690
## [60,] 0.3448276
## [61,] 0.3620690
## [62,] 0.3620690
## [63,] 0.3275862
## [64,] 0.2758621
## [65,] 0.3275862
## [66,] 0.3448276
## [67,] 0.3448276
## [68,] 0.3275862
## [69,] 0.3793103
## [70,] 0.3620690
## [71,] 0.3793103
## [72,] 0.3448276
## [73,] 0.3448276
## [74,] 0.3275862
## [75,] 0.3448276
## [76,] 0.3620690
## [77,] 0.3448276
## [78,] 0.3620690
## [79,] 0.3275862
## [80,] 0.3448276
min(vec)
## [1] 0.2586207
df_error = data.frame(k_vec, vec)
ggplot(df_error, aes(k_vec, vec)) +geom_line()

#Extract minimum value

min_vec <- k_vec[which.min(vec)]
min_vec
## [1] 15

#K-nn model using best k value from graph

y_pred_k = knn(train = train_set %>% select(-readmitted), 
                 test = test_set %>% select(-readmitted),
                 cl = train_set$readmitted,
                 k = min_vec, prob = T)
y_pred_k
##  [1] Yes Yes No  Yes Yes No  Yes Yes Yes No  No  No  Yes Yes No  No  No  No  No 
## [20] Yes No  No  Yes No  Yes Yes Yes No  No  No  No  Yes Yes No  Yes Yes Yes No 
## [39] Yes No  Yes Yes No  No  Yes No  Yes No  No  Yes Yes No  Yes Yes Yes Yes Yes
## [58] Yes
## attr(,"prob")
##  [1] 0.6000000 0.5333333 0.6000000 0.6000000 0.5333333 0.7333333 0.5333333
##  [8] 0.8000000 0.6666667 0.6000000 0.7333333 0.6000000 0.6666667 0.5333333
## [15] 0.6000000 0.6000000 0.7500000 0.6666667 0.7333333 0.5333333 0.5333333
## [22] 0.6000000 0.5333333 0.6666667 0.6000000 0.8666667 0.6000000 0.5333333
## [29] 0.5333333 0.7333333 0.6666667 0.8000000 0.8666667 0.6666667 0.5333333
## [36] 0.6000000 0.6000000 0.6000000 0.5333333 0.5333333 0.8000000 0.5333333
## [43] 0.7333333 0.6666667 0.8666667 0.6000000 0.6666667 0.6666667 0.7333333
## [50] 0.6000000 0.6666667 0.6666667 0.8666667 0.6000000 0.6000000 0.8000000
## [57] 0.5333333 0.6000000
## Levels: No Yes

#Confusion matrix

confusionMatrix(test_set$readmitted, y_pred_k, positive = "Yes")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction No Yes
##        No  17   6
##        Yes  9  26
##                                           
##                Accuracy : 0.7414          
##                  95% CI : (0.6096, 0.8474)
##     No Information Rate : 0.5517          
##     P-Value [Acc > NIR] : 0.002296        
##                                           
##                   Kappa : 0.4714          
##                                           
##  Mcnemar's Test P-Value : 0.605577        
##                                           
##             Sensitivity : 0.8125          
##             Specificity : 0.6538          
##          Pos Pred Value : 0.7429          
##          Neg Pred Value : 0.7391          
##              Prevalence : 0.5517          
##          Detection Rate : 0.4483          
##    Detection Prevalence : 0.6034          
##       Balanced Accuracy : 0.7332          
##                                           
##        'Positive' Class : Yes             
## 

#K-nn ROC

knn_roc <- roc(test_set$readmitted, attr(y_pred_k, 'prob'), plot = T, print.auc = TRUE)
## Setting levels: control = No, case = Yes
## Setting direction: controls > cases

#K-nn AUC

knn_auc<- auc(knn_roc)
knn_auc
## Area under the curve: 0.5329

#Append comparison lists

Model_Names <- append(Model_Names, "K-nn")
Model_AUC_Values <- append(Model_AUC_Values, knn_auc)

XGBoost.

train_set$readmitted <- ifelse(train_set$readmitted == "Yes", 1, 0)
test_set$readmitted <- ifelse(test_set$readmitted == "Yes", 1, 0)
str(train_set$readmitted)
##  num [1:231] 1 0 0 1 1 1 1 1 0 1 ...
str(test_set$readmitted)
##  num [1:58] 1 1 1 1 1 1 1 1 0 0 ...
summary(train_set$readmitted)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  1.0000  0.5974  1.0000  1.0000

#XGBoost model

xgb_classifier = xgboost(data = as.matrix(train_set %>% select(-readmitted)), label = train_set$readmitted, nrounds = 10)
## [1]  train-rmse:0.424124 
## [2]  train-rmse:0.372133 
## [3]  train-rmse:0.324329 
## [4]  train-rmse:0.278860 
## [5]  train-rmse:0.256454 
## [6]  train-rmse:0.229002 
## [7]  train-rmse:0.202807 
## [8]  train-rmse:0.180816 
## [9]  train-rmse:0.167502 
## [10] train-rmse:0.156990

#Prediction on test dataset

xgb_y_prob = predict(xgb_classifier, newdata = as.matrix(test_set %>% select(-readmitted)), type='prob')
head(xgb_y_prob)
## [1] 0.783411264 0.144975439 0.002184162 0.461754203 0.400559574 0.266678959
xgb_y_prob
##  [1]  0.783411264  0.144975439  0.002184162  0.461754203  0.400559574
##  [6]  0.266678959  0.584059536  0.609397054  0.834373891 -0.012347283
## [11]  0.844061494  0.275242180  0.844205260  0.669071317  0.573735237
## [16]  0.298393667  0.222442180  0.408580214  0.210570022  0.801152050
## [21]  0.354468495  0.348010361  0.968341291  0.568436027  0.717948437
## [26]  1.019045591  0.644306839  0.615441024  1.041798830  0.764605165
## [31]  0.544302821  0.879637778  0.944378018  0.223458529  0.601379514
## [36]  0.807839930  0.452889353  0.267640650  0.632538140  0.410308629
## [41]  0.785312235  0.727491140  0.139402330  0.340008676  0.841710329
## [46]  0.566241682  0.832976997  0.360108018  0.655890405  0.729740977
## [51]  0.607832670  0.334819108  0.860021234  0.499060154  0.878506184
## [56]  0.453960270  0.478604823  0.509950459

#Thresholding

xgb_y_pred = (xgb_y_prob >= 0.5)
head(xgb_y_pred)
## [1]  TRUE FALSE FALSE FALSE FALSE FALSE

#Confusion Matrix

cm = table(test_set$readmitted, xgb_y_pred)
cm
##    xgb_y_pred
##     FALSE TRUE
##   0    13   10
##   1    11   24
imp_matrix = xgb.importance(model=xgb_classifier)
imp_matrix
##                        Feature         Gain        Cover   Frequency
##  1:         num_lab_procedures 0.2689344597 0.2436158192 0.233552632
##  2:            num_medications 0.1429274797 0.1529943503 0.115131579
##  3:           time_in_hospital 0.0996285091 0.0835404896 0.148026316
##  4:           number_inpatient 0.0614227957 0.0841431262 0.055921053
##  5:       race_AfricanAmerican 0.0449953651 0.0198870056 0.019736842
##  6:              gender.Female 0.0366176337 0.0378907721 0.019736842
##  7:           number_diagnoses 0.0276272189 0.0277212806 0.042763158
##  8:             num_procedures 0.0252179684 0.0147645951 0.036184211
##  9:      admission_source_id.1 0.0212936225 0.0067796610 0.016447368
## 10:                  age_30_40 0.0191114260 0.0206403013 0.023026316
## 11: discharge_disposition_id.6 0.0177406620 0.0519020716 0.016447368
## 12:                 Other_diag 0.0170822734 0.0192090395 0.013157895
## 13:                  age_70_80 0.0164914895 0.0166478343 0.016447368
## 14:              Diabetes_diag 0.0153439623 0.0186817326 0.019736842
## 15: discharge_disposition_id.3 0.0141633244 0.0071563089 0.013157895
## 16:          max_glu_serum_300 0.0139926016 0.0095668550 0.009868421
## 17:           diabetes_med_yes 0.0136061144 0.0104708098 0.013157895
## 18:             med_change_yes 0.0130305505 0.0042184557 0.003289474
## 19:                A1Cresult_8 0.0127230970 0.0103954802 0.023026316
## 20:      admission_source_id.7 0.0102342724 0.0488135593 0.013157895
## 21:           Circulatory_diag 0.0099948213 0.0090395480 0.013157895
## 22:                  age_50_60 0.0098029727 0.0143126177 0.013157895
## 23:        admission_type_id.1 0.0091304323 0.0084369115 0.009868421
## 24:         max_glu_serum_Norm 0.0089660415 0.0067796610 0.006578947
## 25:          number_outpatient 0.0086200964 0.0127306968 0.009868421
## 26:                  age_80_90 0.0078507218 0.0143879473 0.019736842
## 27:            Respiraoty_diag 0.0074156417 0.0030131827 0.003289474
## 28: discharge_disposition_id.1 0.0071481078 0.0065536723 0.006578947
## 29:                A1Cresult_7 0.0070480323 0.0048210923 0.006578947
## 30:             race_Caucasian 0.0067382781 0.0070809793 0.009868421
## 31:                Injury_diag 0.0063631801 0.0053483992 0.006578947
## 32:          max_glu_serum_200 0.0036252880 0.0054990584 0.006578947
## 33:                  age_60_70 0.0032184927 0.0019585687 0.006578947
## 34: discharge_disposition_id.2 0.0030814088 0.0022598870 0.003289474
## 35:              race_Hispanic 0.0030310561 0.0015065913 0.003289474
## 36:                  age_40_50 0.0028365706 0.0025612053 0.009868421
## 37:             A1Cresult_Norm 0.0018955480 0.0033145009 0.006578947
## 38:         Genitourinary_diag 0.0008495848 0.0009039548 0.003289474
## 39:             Digestive_diag 0.0001988987 0.0004519774 0.003289474
##                        Feature         Gain        Cover   Frequency
xgb.plot.importance(imp_matrix)

#Xgboost ROC

xgb_roc <- roc(test_set$readmitted, xgb_y_prob, plot = T, print.auc = TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

#Xgboost AUC

xgb_auc <- auc(xgb_roc)
xgb_auc
## Area under the curve: 0.6497

#Append comparison lists

Model_Names <- append(Model_Names, "XGBoost")
Model_AUC_Values <- append(Model_AUC_Values, xgb_auc)

Models with PCA data

Logistic Regression using PCA data

#LR PCA model1

lr_pca1<- glm(readmitted~., data = train_pca, family = binomial)
summary(lr_pca1)
## 
## Call:
## glm(formula = readmitted ~ ., family = binomial, data = train_pca)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.8301  -1.1676   0.7582   0.9910   1.4137  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  0.98875    0.25875   3.821 0.000133 ***
## PC1          0.09139    0.03660   2.497 0.012529 *  
## PC2          0.03068    0.03828   0.801 0.422918    
## PC3          0.14388    0.04300   3.346 0.000821 ***
## PC4         -0.05130    0.03538  -1.450 0.147015    
## PC5         -0.03863    0.04397  -0.879 0.379588    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 311.41  on 230  degrees of freedom
## Residual deviance: 294.50  on 225  degrees of freedom
## AIC: 306.5
## 
## Number of Fisher Scoring iterations: 4

#LR PCA model2

lr_pca2 <- glm(readmitted~ PC1 + PC3, data = train_pca, family = binomial)
summary(lr_pca2)
## 
## Call:
## glm(formula = readmitted ~ PC1 + PC3, family = binomial, data = train_pca)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.8601  -1.1934   0.7809   0.9935   1.4373  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  0.80126    0.18400   4.355 1.33e-05 ***
## PC1          0.07752    0.03412   2.272  0.02311 *  
## PC3          0.13207    0.04043   3.267  0.00109 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 311.41  on 230  degrees of freedom
## Residual deviance: 297.70  on 228  degrees of freedom
## AIC: 303.7
## 
## Number of Fisher Scoring iterations: 4

#LR PCA model3

lr_pca3 <- glm(readmitted~ PC3, data = train_pca, family = binomial)
summary(lr_pca3)
## 
## Call:
## glm(formula = readmitted ~ PC3, family = binomial, data = train_pca)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.7831  -1.2565   0.8426   1.0093   1.4092  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  0.61685    0.16002   3.855 0.000116 ***
## PC3          0.10997    0.03894   2.824 0.004747 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 311.41  on 230  degrees of freedom
## Residual deviance: 303.09  on 229  degrees of freedom
## AIC: 307.09
## 
## Number of Fisher Scoring iterations: 4

#Prediction of test pca data using LR_PCA model2 of train pca data (model with lowest AIC value)

lr_pca_predict <- predict(lr_pca2,newdata = test_pca, type = "response" )

#Thresholding to obtain class for confusion matrix

result_pca0.5 = ifelse(lr_pca_predict >=0.5, "Yes", "No")
result_pca0.5
##    461    824    962   1984   2309   5016   6129   6452   7698   9471  11728 
##  "Yes"  "Yes"   "No"  "Yes"  "Yes"   "No"  "Yes"  "Yes"  "Yes"   "No"  "Yes" 
##  15359  15538  17795  18511  20963  23943  25851  29060  29366  30649  31935 
##   "No"  "Yes"  "Yes"  "Yes"  "Yes"   "No"   "No"  "Yes"  "Yes"  "Yes"  "Yes" 
##  32130  34606  35975  36349  36685  39784  39833  40684  44259  44286  46721 
##  "Yes"   "No"  "Yes"  "Yes"   "No"   "No"  "Yes"  "Yes"   "No"  "Yes"  "Yes" 
##  51517  57291  57846  58435  60051  65125  66118  66395  68026  68269  68348 
##   "No"  "Yes"  "Yes"  "Yes"   "No"  "Yes"  "Yes"  "Yes"  "Yes"  "Yes"   "No" 
##  70681  71862  72980  74933  76453  76775  76814  77043  78993  79192  82660 
##  "Yes"   "No"  "Yes"   "No"  "Yes"  "Yes"  "Yes"  "Yes"  "Yes"  "Yes"  "Yes" 
##  84909  88479 101030 
##  "Yes"  "Yes"  "Yes"
table(test_pca$readmitted, result_pca0.5)
##      result_pca0.5
##       No Yes
##   No   8  15
##   Yes  7  28
result_pca_fac0.5 = factor(result_pca0.5)

#Confusion Matrix

confusionMatrix(test_pca$readmitted, result_pca_fac0.5)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction No Yes
##        No   8  15
##        Yes  7  28
##                                           
##                Accuracy : 0.6207          
##                  95% CI : (0.4837, 0.7449)
##     No Information Rate : 0.7414          
##     P-Value [Acc > NIR] : 0.9851          
##                                           
##                   Kappa : 0.1572          
##                                           
##  Mcnemar's Test P-Value : 0.1356          
##                                           
##             Sensitivity : 0.5333          
##             Specificity : 0.6512          
##          Pos Pred Value : 0.3478          
##          Neg Pred Value : 0.8000          
##              Prevalence : 0.2586          
##          Detection Rate : 0.1379          
##    Detection Prevalence : 0.3966          
##       Balanced Accuracy : 0.5922          
##                                           
##        'Positive' Class : No              
## 

#LR PCA ROC

lr_pca_roc <-roc(test_pca$readmitted, lr_pca_predict, plot = TRUE, print.auc = TRUE)
## Setting levels: control = No, case = Yes
## Setting direction: controls < cases

#LR PCA AUC

lr_pca_auc <- auc(lr_pca_roc)
lr_pca_auc
## Area under the curve: 0.6447

#Append comparison lists

Model_Names <- append(Model_Names, "Logistic Regression PCA")
Model_AUC_Values <- append(Model_AUC_Values, lr_pca_auc)

SVM using PCA data

#SVM PCA model(kernel = linear)

classifier_pca_linear = svm(formula = readmitted~., data = train_pca, kernel = "linear")
classifier_pca_linear
## 
## Call:
## svm(formula = readmitted ~ ., data = train_pca, kernel = "linear")
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  linear 
##        cost:  1 
## 
## Number of Support Vectors:  187

#Predictions on test data

y_pred_svml_pca = predict(classifier_pca_linear, newdata = test_pca)

#Confusion matrix

cm_svml_pca = table(test_pca$readmitted, y_pred_svml_pca)
cm_svml_pca
##      y_pred_svml_pca
##       No Yes
##   No   9  14
##   Yes  6  29
confusionMatrix(test_pca$readmitted, y_pred_svml_pca)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction No Yes
##        No   9  14
##        Yes  6  29
##                                           
##                Accuracy : 0.6552          
##                  95% CI : (0.5188, 0.7751)
##     No Information Rate : 0.7414          
##     P-Value [Acc > NIR] : 0.9469          
##                                           
##                   Kappa : 0.2338          
##                                           
##  Mcnemar's Test P-Value : 0.1175          
##                                           
##             Sensitivity : 0.6000          
##             Specificity : 0.6744          
##          Pos Pred Value : 0.3913          
##          Neg Pred Value : 0.8286          
##              Prevalence : 0.2586          
##          Detection Rate : 0.1552          
##    Detection Prevalence : 0.3966          
##       Balanced Accuracy : 0.6372          
##                                           
##        'Positive' Class : No              
## 

#SVM model (kernel = radial)

classifier_radial_pca = svm(formula = readmitted~., data = train_pca, kernel = "radial")
classifier_radial_pca
## 
## Call:
## svm(formula = readmitted ~ ., data = train_pca, kernel = "radial")
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  radial 
##        cost:  1 
## 
## Number of Support Vectors:  187

#Prediction on test data

y_pred_svmr_pca = predict(classifier_radial_pca, newdata = test_pca)
y_pred_svmr_pca
##    461    824    962   1984   2309   5016   6129   6452   7698   9471  11728 
##    Yes    Yes    Yes    Yes    Yes     No     No    Yes    Yes    Yes    Yes 
##  15359  15538  17795  18511  20963  23943  25851  29060  29366  30649  31935 
##     No    Yes     No    Yes    Yes    Yes    Yes     No     No    Yes     No 
##  32130  34606  35975  36349  36685  39784  39833  40684  44259  44286  46721 
##    Yes    Yes    Yes    Yes    Yes     No    Yes    Yes     No    Yes    Yes 
##  51517  57291  57846  58435  60051  65125  66118  66395  68026  68269  68348 
##     No    Yes    Yes    Yes    Yes    Yes     No    Yes    Yes     No     No 
##  70681  71862  72980  74933  76453  76775  76814  77043  78993  79192  82660 
##    Yes     No    Yes     No    Yes    Yes    Yes    Yes    Yes    Yes    Yes 
##  84909  88479 101030 
##    Yes    Yes    Yes 
## Levels: No Yes

#Confusion matrix

cm_svmr_pca = table(test_pca$readmitted, y_pred_svmr_pca)
cm_svmr_pca
##      y_pred_svmr_pca
##       No Yes
##   No   7  16
##   Yes  8  27
confusionMatrix(test_pca$readmitted, y_pred_svmr_pca)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction No Yes
##        No   7  16
##        Yes  8  27
##                                          
##                Accuracy : 0.5862         
##                  95% CI : (0.4493, 0.714)
##     No Information Rate : 0.7414         
##     P-Value [Acc > NIR] : 0.9968         
##                                          
##                   Kappa : 0.0806         
##                                          
##  Mcnemar's Test P-Value : 0.1530         
##                                          
##             Sensitivity : 0.4667         
##             Specificity : 0.6279         
##          Pos Pred Value : 0.3043         
##          Neg Pred Value : 0.7714         
##              Prevalence : 0.2586         
##          Detection Rate : 0.1207         
##    Detection Prevalence : 0.3966         
##       Balanced Accuracy : 0.5473         
##                                          
##        'Positive' Class : No             
## 

#SVM tune

svm_pca_tune <- tune(svm, train.x = train_pca %>% select(-readmitted), 
                 train.y = train_pca$readmitted, 
                 kernel = "radial", ranges = list(cost = 10^(-1:2), gamma = c(0.25, 0.5, 1,2)))
svm_pca_tune
## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  cost gamma
##   100     2
## 
## - best performance: 0.3552536

#SVM model using best cost and gamma parameters obtained after tuning.

classifier_cg_pca = svm(formula = as.factor(train_pca$readmitted)~., data = train_pca, type = "C-classification", kernel = "radial", cost = 100, gamma = 2)

#Predictions on test set

y_pred_cg_pca = predict(classifier_cg_pca, newdata = test_pca)
cm_cg_pca = table(test_pca$readmitted, y_pred_cg_pca)
cm_cg_pca
##      y_pred_cg_pca
##       No Yes
##   No   9  14
##   Yes 14  21
confusionMatrix(test_pca$readmitted, y_pred_cg_pca)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction No Yes
##        No   9  14
##        Yes 14  21
##                                           
##                Accuracy : 0.5172          
##                  95% CI : (0.3822, 0.6505)
##     No Information Rate : 0.6034          
##     P-Value [Acc > NIR] : 0.929           
##                                           
##                   Kappa : -0.0087         
##                                           
##  Mcnemar's Test P-Value : 1.000           
##                                           
##             Sensitivity : 0.3913          
##             Specificity : 0.6000          
##          Pos Pred Value : 0.3913          
##          Neg Pred Value : 0.6000          
##              Prevalence : 0.3966          
##          Detection Rate : 0.1552          
##    Detection Prevalence : 0.3966          
##       Balanced Accuracy : 0.4957          
##                                           
##        'Positive' Class : No              
## 
summary(classifier_cg_pca)
## 
## Call:
## svm(formula = as.factor(train_pca$readmitted) ~ ., data = train_pca, 
##     type = "C-classification", kernel = "radial", cost = 100, gamma = 2)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  radial 
##        cost:  100 
## 
## Number of Support Vectors:  215
## 
##  ( 129 86 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  No Yes

#SVM PCA ROC

svm_pca_roc = roc(test_pca$readmitted, factor(y_pred_cg_pca, ordered = T), plot = T, print.auc = TRUE)
## Setting levels: control = No, case = Yes
## Setting direction: controls < cases

#SVM PCA AUC

svm_pca_auc <- auc(svm_roc)
svm_pca_auc
## Area under the curve: 0.4857

#Append comparison lists

Model_Names <- append(Model_Names, "SVM PCA")
Model_AUC_Values <- append(Model_AUC_Values, svm_pca_auc)

K-nn using PCA data

#Determine best k value

vec_pca = c()
k_vec_pca = c()
for(k_pca in 1:80){
  y_pred_knn_pca = knn(train = train_pca %>% select(-readmitted), 
                 test = test_pca %>% select(-readmitted),
                 cl = train_pca$readmitted,
               k=k)
  error_pca = mean(y_pred_knn_pca != test_pca$readmitted)
  k_vec_pca = c(k_vec_pca, k_pca)
  vec_pca = c(vec_pca, error_pca)
}
cbind(vec_pca)
##         vec_pca
##  [1,] 0.3793103
##  [2,] 0.3448276
##  [3,] 0.3448276
##  [4,] 0.3448276
##  [5,] 0.3620690
##  [6,] 0.3793103
##  [7,] 0.3620690
##  [8,] 0.3448276
##  [9,] 0.3620690
## [10,] 0.3620690
## [11,] 0.3620690
## [12,] 0.3448276
## [13,] 0.3448276
## [14,] 0.3448276
## [15,] 0.3793103
## [16,] 0.3793103
## [17,] 0.3448276
## [18,] 0.3620690
## [19,] 0.3448276
## [20,] 0.3620690
## [21,] 0.3793103
## [22,] 0.3448276
## [23,] 0.3448276
## [24,] 0.3620690
## [25,] 0.3448276
## [26,] 0.3620690
## [27,] 0.3620690
## [28,] 0.3620690
## [29,] 0.3793103
## [30,] 0.3448276
## [31,] 0.3448276
## [32,] 0.3448276
## [33,] 0.3620690
## [34,] 0.3620690
## [35,] 0.3448276
## [36,] 0.3620690
## [37,] 0.3620690
## [38,] 0.3620690
## [39,] 0.3448276
## [40,] 0.3620690
## [41,] 0.3620690
## [42,] 0.3793103
## [43,] 0.3793103
## [44,] 0.3620690
## [45,] 0.3448276
## [46,] 0.3620690
## [47,] 0.3448276
## [48,] 0.3448276
## [49,] 0.3620690
## [50,] 0.3793103
## [51,] 0.3620690
## [52,] 0.3620690
## [53,] 0.3793103
## [54,] 0.3448276
## [55,] 0.3620690
## [56,] 0.3448276
## [57,] 0.3620690
## [58,] 0.3620690
## [59,] 0.3448276
## [60,] 0.3620690
## [61,] 0.3620690
## [62,] 0.3793103
## [63,] 0.3620690
## [64,] 0.3793103
## [65,] 0.3448276
## [66,] 0.3793103
## [67,] 0.3448276
## [68,] 0.3620690
## [69,] 0.3620690
## [70,] 0.3793103
## [71,] 0.3620690
## [72,] 0.3793103
## [73,] 0.3448276
## [74,] 0.3448276
## [75,] 0.3620690
## [76,] 0.3620690
## [77,] 0.3620690
## [78,] 0.3448276
## [79,] 0.3620690
## [80,] 0.3793103
min(vec_pca)
## [1] 0.3448276
df_error_pca = data.frame(k_vec_pca, vec_pca)
ggplot(df_error_pca, aes(k_vec_pca, vec_pca)) +geom_line()

#Extract minimum value

min_vec_pca <- k_vec_pca[which.min(vec_pca)]
min_vec_pca
## [1] 2

#K-nn model using best k value from graph

y_pred_k_pca = knn(train = train_pca %>% select(-readmitted), 
                 test = test_pca %>% select(-readmitted),
                 cl = train_pca$readmitted,
                 k = min_vec_pca, prob = T)
y_pred_k_pca
##  [1] Yes Yes No  Yes Yes No  No  Yes Yes No  No  No  Yes Yes Yes No  Yes No  No 
## [20] No  No  No  Yes No  Yes Yes Yes No  Yes Yes No  Yes Yes No  No  No  Yes Yes
## [39] Yes No  No  No  No  Yes Yes No  No  No  No  Yes Yes No  Yes Yes No  Yes No 
## [58] Yes
## attr(,"prob")
##  [1] 1.0 1.0 1.0 0.5 1.0 0.5 1.0 0.5 1.0 1.0 1.0 1.0 1.0 1.0 1.0 0.5 1.0 0.5 0.5
## [20] 0.5 1.0 0.5 1.0 1.0 1.0 1.0 0.5 1.0 1.0 0.5 1.0 1.0 1.0 1.0 0.5 0.5 1.0 0.5
## [39] 0.5 1.0 0.5 0.5 0.5 1.0 1.0 1.0 0.5 1.0 0.5 1.0 1.0 0.5 1.0 1.0 0.5 1.0 0.5
## [58] 1.0
## Levels: No Yes

#Confusion matrix

confusionMatrix(test_pca$readmitted, y_pred_k_pca, positive = "Yes")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction No Yes
##        No  13  10
##        Yes 16  19
##                                           
##                Accuracy : 0.5517          
##                  95% CI : (0.4154, 0.6826)
##     No Information Rate : 0.5             
##     P-Value [Acc > NIR] : 0.2559          
##                                           
##                   Kappa : 0.1034          
##                                           
##  Mcnemar's Test P-Value : 0.3268          
##                                           
##             Sensitivity : 0.6552          
##             Specificity : 0.4483          
##          Pos Pred Value : 0.5429          
##          Neg Pred Value : 0.5652          
##              Prevalence : 0.5000          
##          Detection Rate : 0.3276          
##    Detection Prevalence : 0.6034          
##       Balanced Accuracy : 0.5517          
##                                           
##        'Positive' Class : Yes             
## 

#K-nn PCA ROC

knn_pca_roc<- roc(test_pca$readmitted, attr(y_pred_k_pca, 'prob'), plot = T, print.auc = TRUE)
## Setting levels: control = No, case = Yes
## Setting direction: controls < cases

#K-nn PCA AUC

knn_pca_auc<- auc(knn_pca_roc)
knn_pca_auc
## Area under the curve: 0.546

#Append comparison lists

Model_Names <- append(Model_Names, "K-nn PCA")
Model_AUC_Values <- append(Model_AUC_Values, knn_pca_auc)

Cross Validation

train_set$readmitted <- ifelse(train_set$readmitted == 1, "Yes", "No")
test_set$readmitted <- ifelse(test_set$readmitted == 1, "Yes", "No")
train_set$readmitted <- as.factor(train_set$readmitted)
test_set$readmitted <- as.factor(test_set$readmitted)
str(train_set$readmitted)
##  Factor w/ 2 levels "No","Yes": 2 1 1 2 2 2 2 2 1 2 ...

Logistic Regression Cross Validation.

train_control <- trainControl(method = "cv",
                              number = 10,
                              classProbs = TRUE,
                              summaryFunction = twoClassSummary)
lr_cv = train(readmitted ~., data = train_set,
              method = "glm",
              metric="ROC",
              tuneLength = 10,
              trControl = train_control)
lr_cv
## Generalized Linear Model 
## 
## 231 samples
##  43 predictor
##   2 classes: 'No', 'Yes' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 208, 208, 207, 208, 209, 208, ... 
## Resampling results:
## 
##   ROC        Sens       Spec     
##   0.6508852  0.4911111  0.6978022
lr_cv$results
##   parameter       ROC      Sens      Spec     ROCSD   SensSD    SpecSD
## 1      none 0.6508852 0.4911111 0.6978022 0.1158645 0.203252 0.1317154

#Prediction of test data

lr_pred <- predict(lr_cv, test_set)
lr_pred
##  [1] No  No  No  Yes Yes No  Yes Yes Yes No  Yes No  Yes No  No  No  No  No  No 
## [20] Yes Yes No  Yes Yes No  Yes Yes No  Yes No  No  Yes Yes No  Yes No  Yes No 
## [39] Yes No  Yes Yes No  No  Yes No  Yes No  No  Yes Yes No  Yes No  Yes Yes Yes
## [58] Yes
## Levels: No Yes

#Obtain probabilities

lr_pred_prob <- predict(lr_cv, test_set, type = "prob")
lr_pred_prob
##                 No        Yes
## 461    0.675463352 0.32453665
## 824    0.947745309 0.05225469
## 962    0.826163335 0.17383667
## 1984   0.169275517 0.83072448
## 2309   0.259408539 0.74059146
## 5016   0.949737666 0.05026233
## 6129   0.091369048 0.90863095
## 6452   0.072585298 0.92741470
## 7698   0.079277404 0.92072260
## 9471   0.905648871 0.09435113
## 11728  0.240533615 0.75946638
## 15359  0.852050046 0.14794995
## 15538  0.247903978 0.75209602
## 17795  0.662634009 0.33736599
## 18511  0.510548178 0.48945182
## 20963  0.872472799 0.12752720
## 23943  0.733487687 0.26651231
## 25851  0.775771173 0.22422883
## 29060  0.516374021 0.48362598
## 29366  0.222229905 0.77777009
## 30649  0.102321559 0.89767844
## 31935  0.649185909 0.35081409
## 32130  0.085387095 0.91461290
## 34606  0.474587212 0.52541279
## 35975  0.505571313 0.49442869
## 36349  0.041702525 0.95829747
## 36685  0.157486296 0.84251370
## 39784  0.910804537 0.08919546
## 39833  0.288054914 0.71194509
## 40684  0.722441783 0.27755822
## 44259  0.561480202 0.43851980
## 44286  0.102238276 0.89776172
## 46721  0.207769314 0.79223069
## 51517  0.694022254 0.30597775
## 57291  0.453205894 0.54679411
## 57846  0.620038250 0.37996175
## 58435  0.036410839 0.96358916
## 60051  0.796717131 0.20328287
## 65125  0.039615248 0.96038475
## 66118  0.811508992 0.18849101
## 66395  0.284824737 0.71517526
## 68026  0.324725038 0.67527496
## 68269  0.622240232 0.37775977
## 68348  0.836533531 0.16346647
## 70681  0.046481004 0.95351900
## 71862  0.859933847 0.14006615
## 72980  0.008226799 0.99177320
## 74933  0.819798396 0.18020160
## 76453  0.681474155 0.31852585
## 76775  0.132532454 0.86746755
## 76814  0.178155427 0.82184457
## 77043  0.539392430 0.46060757
## 78993  0.167075654 0.83292435
## 79192  0.766071686 0.23392831
## 82660  0.113143933 0.88685607
## 84909  0.380328135 0.61967187
## 88479  0.484415642 0.51558436
## 101030 0.421614305 0.57838569
str(test_set$readmitted)
##  Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 1 1 ...

#Confusion Matrix

lr_cv_cm <- confusionMatrix(lr_pred, test_set$readmitted)
lr_cv_cm 
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction No Yes
##        No  16  12
##        Yes  7  23
##                                           
##                Accuracy : 0.6724          
##                  95% CI : (0.5366, 0.7899)
##     No Information Rate : 0.6034          
##     P-Value [Acc > NIR] : 0.1741          
##                                           
##                   Kappa : 0.3401          
##                                           
##  Mcnemar's Test P-Value : 0.3588          
##                                           
##             Sensitivity : 0.6957          
##             Specificity : 0.6571          
##          Pos Pred Value : 0.5714          
##          Neg Pred Value : 0.7667          
##              Prevalence : 0.3966          
##          Detection Rate : 0.2759          
##    Detection Prevalence : 0.4828          
##       Balanced Accuracy : 0.6764          
##                                           
##        'Positive' Class : No              
## 

#LR CV ROC

lr_cv_roc <- roc(test_set$readmitted, lr_pred_prob$Yes, plot = T, print.auc = TRUE)
## Setting levels: control = No, case = Yes
## Setting direction: controls < cases

#LR CV AUC

lr_cv_auc <- auc(lr_cv_roc)
lr_cv_auc
## Area under the curve: 0.7267

#Append comparison lists

Model_Names <- append(Model_Names, "Logistic Regression Cross Validation")
Model_AUC_Values  <- append(Model_AUC_Values, lr_cv_auc)

Elastic Net Cross Validation.

glmnet_cv= train(readmitted ~., data = train_set,
                  method = "glmnet",
                  metric="ROC",
                  trControl = train_control)
glmnet_cv
## glmnet 
## 
## 231 samples
##  43 predictor
##   2 classes: 'No', 'Yes' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 208, 209, 207, 207, 208, 208, ... 
## Resampling results across tuning parameters:
## 
##   alpha  lambda        ROC        Sens       Spec     
##   0.10   0.0002334386  0.6260501  0.4977778  0.7197802
##   0.10   0.0023343864  0.6254640  0.4977778  0.7269231
##   0.10   0.0233438645  0.6481502  0.5200000  0.7483516
##   0.55   0.0002334386  0.6246215  0.5088889  0.7197802
##   0.55   0.0023343864  0.6307143  0.5088889  0.7192308
##   0.55   0.0233438645  0.6723138  0.5077778  0.7917582
##   1.00   0.0002334386  0.6261905  0.5088889  0.7197802
##   1.00   0.0023343864  0.6330220  0.5088889  0.7192308
##   1.00   0.0233438645  0.6845849  0.4633333  0.8274725
## 
## ROC was used to select the optimal model using the largest value.
## The final values used for the model were alpha = 1 and lambda = 0.02334386.

#Best parameters

glmnet_cv$bestTune
##   alpha     lambda
## 9     1 0.02334386
glmnet_cv$results
##   alpha       lambda       ROC      Sens      Spec     ROCSD    SensSD
## 1  0.10 0.0002334386 0.6260501 0.4977778 0.7197802 0.1547494 0.1952663
## 2  0.10 0.0023343864 0.6254640 0.4977778 0.7269231 0.1510252 0.2088442
## 3  0.10 0.0233438645 0.6481502 0.5200000 0.7483516 0.1347680 0.1941391
## 4  0.55 0.0002334386 0.6246215 0.5088889 0.7197802 0.1573322 0.1950555
## 5  0.55 0.0023343864 0.6307143 0.5088889 0.7192308 0.1476213 0.2151211
## 6  0.55 0.0233438645 0.6723138 0.5077778 0.7917582 0.1170483 0.1773954
## 7  1.00 0.0002334386 0.6261905 0.5088889 0.7197802 0.1567712 0.1950555
## 8  1.00 0.0023343864 0.6330220 0.5088889 0.7192308 0.1384531 0.2086471
## 9  1.00 0.0233438645 0.6845849 0.4633333 0.8274725 0.1203069 0.1041064
##       SpecSD
## 1 0.13017843
## 2 0.13602647
## 3 0.12526187
## 4 0.13017843
## 5 0.12546656
## 6 0.10192271
## 7 0.13017843
## 8 0.12546656
## 9 0.08901928

#Prediction on test set

glmnet_cv_pred <- predict(glmnet_cv, test_set)
glmnet_cv_pred
##  [1] No  No  No  Yes Yes No  Yes Yes Yes No  Yes No  Yes Yes Yes No  No  No  No 
## [20] Yes Yes No  Yes Yes Yes Yes Yes No  Yes Yes Yes Yes Yes No  Yes Yes Yes No 
## [39] Yes No  Yes Yes No  No  Yes No  Yes No  No  Yes Yes No  Yes No  Yes Yes Yes
## [58] Yes
## Levels: No Yes

#Obtain probabilities

glmnet_cv_prob <- predict(glmnet_cv, test_set, type = "prob")
glmnet_cv_prob
##            No       Yes
## 1  0.57996883 0.4200312
## 2  0.69385962 0.3061404
## 3  0.66739281 0.3326072
## 4  0.28402550 0.7159745
## 5  0.35378848 0.6462115
## 6  0.71592726 0.2840727
## 7  0.21716228 0.7828377
## 8  0.25824862 0.7417514
## 9  0.24884354 0.7511565
## 10 0.74807346 0.2519265
## 11 0.27305866 0.7269413
## 12 0.72822752 0.2717725
## 13 0.34313393 0.6568661
## 14 0.45496872 0.5450313
## 15 0.45709405 0.5429059
## 16 0.64555475 0.3544452
## 17 0.65923921 0.3407608
## 18 0.55242522 0.4475748
## 19 0.51727656 0.4827234
## 20 0.37414121 0.6258588
## 21 0.31512401 0.6848760
## 22 0.50138927 0.4986107
## 23 0.15751985 0.8424801
## 24 0.49643791 0.5035621
## 25 0.44967050 0.5503295
## 26 0.15088027 0.8491197
## 27 0.30778563 0.6922144
## 28 0.73282226 0.2671777
## 29 0.22831025 0.7716897
## 30 0.49690211 0.5030979
## 31 0.44851411 0.5514859
## 32 0.23211403 0.7678860
## 33 0.36042008 0.6395799
## 34 0.59755350 0.4024465
## 35 0.34510891 0.6548911
## 36 0.48775666 0.5122433
## 37 0.18883683 0.8111632
## 38 0.60933876 0.3906612
## 39 0.13194910 0.8680509
## 40 0.64238675 0.3576133
## 41 0.29023603 0.7097640
## 42 0.39919973 0.6008003
## 43 0.54721781 0.4527822
## 44 0.66832706 0.3316729
## 45 0.13933073 0.8606693
## 46 0.72818275 0.2718172
## 47 0.06732332 0.9326767
## 48 0.72355879 0.2764412
## 49 0.54083040 0.4591696
## 50 0.25356765 0.7464324
## 51 0.26435849 0.7356415
## 52 0.53782960 0.4621704
## 53 0.24163410 0.7583659
## 54 0.61688131 0.3831187
## 55 0.27494688 0.7250531
## 56 0.33167862 0.6683214
## 57 0.42167999 0.5783200
## 58 0.42138281 0.5786172

#Confusion matrix

glmnet_cm <-confusionMatrix(glmnet_cv_pred, test_set$readmitted)
glmnet_cm
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction No Yes
##        No  14   8
##        Yes  9  27
##                                           
##                Accuracy : 0.7069          
##                  95% CI : (0.5727, 0.8191)
##     No Information Rate : 0.6034          
##     P-Value [Acc > NIR] : 0.06804         
##                                           
##                   Kappa : 0.383           
##                                           
##  Mcnemar's Test P-Value : 1.00000         
##                                           
##             Sensitivity : 0.6087          
##             Specificity : 0.7714          
##          Pos Pred Value : 0.6364          
##          Neg Pred Value : 0.7500          
##              Prevalence : 0.3966          
##          Detection Rate : 0.2414          
##    Detection Prevalence : 0.3793          
##       Balanced Accuracy : 0.6901          
##                                           
##        'Positive' Class : No              
## 

#ElasticNet CV ROC

glmnet_cv_roc <- roc(test_set$readmitted, glmnet_cv_prob$Yes, plot = T, print.auc = TRUE)
## Setting levels: control = No, case = Yes
## Setting direction: controls < cases

#ElasticNet CV AUC

glmnet_cv_auc <- auc(glmnet_cv_roc)
glmnet_cv_auc
## Area under the curve: 0.6994

#Append comparison lists

Model_Names <- append(Model_Names, "GLMNet Cross Validation")
Model_AUC_Values  <- append(Model_AUC_Values, glmnet_cv_auc)

SVM Cross Validation.

SVM Radial Cross Validation

svmr_cv <- train(readmitted ~., data = train_set, 
               method = "svmRadial",
               metric="ROC",
               trControl = train_control,
               preProcess = c("center", "scale")) 
svmr_cv
## Support Vector Machines with Radial Basis Function Kernel 
## 
## 231 samples
##  43 predictor
##   2 classes: 'No', 'Yes' 
## 
## Pre-processing: centered (43), scaled (43) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 209, 207, 208, 208, 208, 209, ... 
## Resampling results across tuning parameters:
## 
##   C     ROC        Sens       Spec     
##   0.25  0.6391514  0.3755556  0.8115385
##   0.50  0.6407387  0.3955556  0.8192308
##   1.00  0.6263004  0.2555556  0.8917582
## 
## Tuning parameter 'sigma' was held constant at a value of 0.01305887
## ROC was used to select the optimal model using the largest value.
## The final values used for the model were sigma = 0.01305887 and C = 0.5.

#Best Tuned parameters

svmr_cv$bestTune
##        sigma   C
## 2 0.01305887 0.5
svmr_cv$results
##        sigma    C       ROC      Sens      Spec      ROCSD    SensSD     SpecSD
## 1 0.01305887 0.25 0.6391514 0.3755556 0.8115385 0.08099193 0.1242591 0.09054549
## 2 0.01305887 0.50 0.6407387 0.3955556 0.8192308 0.07913294 0.1525458 0.08981645
## 3 0.01305887 1.00 0.6263004 0.2555556 0.8917582 0.09661604 0.1296825 0.08476708

#Prediction on test set

svmr_cv_pred <- predict(svmr_cv, test_set)
svmr_cv_pred
##  [1] No  No  No  Yes Yes No  Yes Yes Yes No  Yes No  Yes No  No  No  No  No  No 
## [20] No  Yes No  Yes No  Yes Yes Yes No  Yes No  No  Yes Yes No  Yes No  Yes No 
## [39] Yes No  Yes Yes No  No  Yes No  Yes No  No  Yes Yes No  Yes No  Yes Yes No 
## [58] Yes
## Levels: No Yes

#Obtain probabilities

svmr_cv_prob <- predict(svmr_cv, test_set, type = "prob")
svmr_cv_prob
##           No       Yes
## 1  0.5769947 0.4230053
## 2  0.5763076 0.4236924
## 3  0.6838934 0.3161066
## 4  0.3620863 0.6379137
## 5  0.3604649 0.6395351
## 6  0.7759164 0.2240836
## 7  0.3191321 0.6808679
## 8  0.2468887 0.7531113
## 9  0.2749433 0.7250567
## 10 0.6577235 0.3422765
## 11 0.4207962 0.5792038
## 12 0.6907564 0.3092436
## 13 0.2942528 0.7057472
## 14 0.5898328 0.4101672
## 15 0.5464387 0.4535613
## 16 0.5535083 0.4464917
## 17 0.6567153 0.3432847
## 18 0.6750684 0.3249316
## 19 0.6916219 0.3083781
## 20 0.5711105 0.4288895
## 21 0.4754318 0.5245682
## 22 0.5327080 0.4672920
## 23 0.1956482 0.8043518
## 24 0.5560474 0.4439526
## 25 0.3911894 0.6088106
## 26 0.1567603 0.8432397
## 27 0.2860177 0.7139823
## 28 0.7485627 0.2514373
## 29 0.3758562 0.6241438
## 30 0.5429425 0.4570575
## 31 0.6103655 0.3896345
## 32 0.2453747 0.7546253
## 33 0.3488706 0.6511294
## 34 0.7199452 0.2800548
## 35 0.4371471 0.5628529
## 36 0.5192925 0.4807075
## 37 0.2393439 0.7606561
## 38 0.7468459 0.2531541
## 39 0.2314607 0.7685393
## 40 0.7429177 0.2570823
## 41 0.3175687 0.6824313
## 42 0.2736421 0.7263579
## 43 0.6667483 0.3332517
## 44 0.7371417 0.2628583
## 45 0.1959194 0.8040806
## 46 0.5686266 0.4313734
## 47 0.2391227 0.7608773
## 48 0.8925172 0.1074828
## 49 0.6096774 0.3903226
## 50 0.3288979 0.6711021
## 51 0.4003295 0.5996705
## 52 0.6296448 0.3703552
## 53 0.2260528 0.7739472
## 54 0.5816134 0.4183866
## 55 0.2401978 0.7598022
## 56 0.3795123 0.6204877
## 57 0.5588951 0.4411049
## 58 0.4904895 0.5095105

#Confusion matrix

svmr_cv_cm <- confusionMatrix(svmr_cv_pred, test_set$readmitted)
svmr_cv_cm
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction No Yes
##        No  17  13
##        Yes  6  22
##                                           
##                Accuracy : 0.6724          
##                  95% CI : (0.5366, 0.7899)
##     No Information Rate : 0.6034          
##     P-Value [Acc > NIR] : 0.1741          
##                                           
##                   Kappa : 0.3495          
##                                           
##  Mcnemar's Test P-Value : 0.1687          
##                                           
##             Sensitivity : 0.7391          
##             Specificity : 0.6286          
##          Pos Pred Value : 0.5667          
##          Neg Pred Value : 0.7857          
##              Prevalence : 0.3966          
##          Detection Rate : 0.2931          
##    Detection Prevalence : 0.5172          
##       Balanced Accuracy : 0.6839          
##                                           
##        'Positive' Class : No              
## 

#SVM (radial) Cross Validation ROC

svmr_cv_roc <- roc(test_set$readmitted, svmr_cv_prob$Yes, plot = T, print.auc = TRUE)
## Setting levels: control = No, case = Yes
## Setting direction: controls < cases

#SVM (radial) Cross Validation AUC

svmr_cv_auc <- auc(svmr_cv_roc)
svmr_cv_auc 
## Area under the curve: 0.6969

#Append comparison lists

Model_Names <- append(Model_Names, "SVM (radial) Cross Validation")
Model_AUC_Values  <- append(Model_AUC_Values, svmr_cv_auc)

SVM Cross Validation

svml_cv <- train(readmitted ~., data = train_set, 
               method = "svmLinear",
               metric="ROC",
               trControl = train_control,
               preProcess = c("center", "scale")) 
svml_cv
## Support Vector Machines with Linear Kernel 
## 
## 231 samples
##  43 predictor
##   2 classes: 'No', 'Yes' 
## 
## Pre-processing: centered (43), scaled (43) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 208, 209, 208, 207, 208, 207, ... 
## Resampling results:
## 
##   ROC        Sens       Spec     
##   0.6521795  0.1811111  0.9274725
## 
## Tuning parameter 'C' was held constant at a value of 1

#Best Tuned parameters

svml_cv$bestTune
##   C
## 1 1
svml_cv$results
##   C       ROC      Sens      Spec     ROCSD   SensSD     SpecSD
## 1 1 0.6521795 0.1811111 0.9274725 0.1350173 0.136992 0.06738334

#Prediction on test set

svml_cv_pred <- predict(svml_cv, test_set)
svml_cv_pred
##  [1] Yes Yes Yes Yes Yes No  Yes Yes Yes No  Yes Yes Yes Yes Yes Yes Yes Yes Yes
## [20] Yes Yes Yes Yes Yes Yes Yes Yes No  Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes
## [39] Yes Yes Yes Yes Yes Yes Yes No  Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes
## [58] Yes
## Levels: No Yes

#Obtain probabilities

svml_cv_prob <- predict(svml_cv, test_set, type = "prob")
svml_cv_prob
##           No       Yes
## 1  0.4431314 0.5568686
## 2  0.4982408 0.5017592
## 3  0.4658163 0.5341837
## 4  0.3745219 0.6254781
## 5  0.3916940 0.6083060
## 6  0.5326142 0.4673858
## 7  0.3481217 0.6518783
## 8  0.3292040 0.6707960
## 9  0.3273604 0.6726396
## 10 0.5068224 0.4931776
## 11 0.3854271 0.6145729
## 12 0.4896421 0.5103579
## 13 0.3754099 0.6245901
## 14 0.4779784 0.5220216
## 15 0.4172089 0.5827911
## 16 0.4946418 0.5053582
## 17 0.4586103 0.5413897
## 18 0.4617907 0.5382093
## 19 0.4237267 0.5762733
## 20 0.3822677 0.6177323
## 21 0.3485692 0.6514308
## 22 0.4436891 0.5563109
## 23 0.3484069 0.6515931
## 24 0.4349139 0.5650861
## 25 0.4190052 0.5809948
## 26 0.3149672 0.6850328
## 27 0.3710468 0.6289532
## 28 0.5028279 0.4971721
## 29 0.3956450 0.6043550
## 30 0.4497018 0.5502982
## 31 0.4340911 0.5659089
## 32 0.3384045 0.6615955
## 33 0.3827347 0.6172653
## 34 0.4596682 0.5403318
## 35 0.4408209 0.5591791
## 36 0.4302368 0.5697632
## 37 0.3019393 0.6980607
## 38 0.4608635 0.5391365
## 39 0.3269750 0.6730250
## 40 0.4914957 0.5085043
## 41 0.4084308 0.5915692
## 42 0.3931231 0.6068769
## 43 0.4812504 0.5187496
## 44 0.4959837 0.5040163
## 45 0.3294059 0.6705941
## 46 0.5113351 0.4886649
## 47 0.3086960 0.6913040
## 48 0.4904065 0.5095935
## 49 0.4636852 0.5363148
## 50 0.3507997 0.6492003
## 51 0.3872374 0.6127626
## 52 0.4344101 0.5655899
## 53 0.3982462 0.6017538
## 54 0.4672895 0.5327105
## 55 0.3509703 0.6490297
## 56 0.4172474 0.5827526
## 57 0.4279689 0.5720311
## 58 0.4244512 0.5755488

#Confusion matrix

svml_cv_cm <- confusionMatrix(svml_cv_pred, test_set$readmitted)
svml_cv_cm
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction No Yes
##        No   3   1
##        Yes 20  34
##                                           
##                Accuracy : 0.6379          
##                  95% CI : (0.5012, 0.7601)
##     No Information Rate : 0.6034          
##     P-Value [Acc > NIR] : 0.3467          
##                                           
##                   Kappa : 0.1187          
##                                           
##  Mcnemar's Test P-Value : 8.568e-05       
##                                           
##             Sensitivity : 0.13043         
##             Specificity : 0.97143         
##          Pos Pred Value : 0.75000         
##          Neg Pred Value : 0.62963         
##              Prevalence : 0.39655         
##          Detection Rate : 0.05172         
##    Detection Prevalence : 0.06897         
##       Balanced Accuracy : 0.55093         
##                                           
##        'Positive' Class : No              
## 

#SVM (linear) Cross Validation ROC

svml_cv_roc <- roc(test_set$readmitted, svml_cv_prob$Yes, plot = T, print.auc = TRUE)
## Setting levels: control = No, case = Yes
## Setting direction: controls < cases

#SVM (linear) Cross Validation AUC

svml_cv_auc <- auc(svml_cv_roc)
svml_cv_auc 
## Area under the curve: 0.7155

#Append comparison lists

Model_Names <- append(Model_Names, "SVM (linear) Cross Validation ")
Model_AUC_Values  <- append(Model_AUC_Values, svml_cv_auc)

K-nn Cross Validation.

#Model

knn_cv = train(readmitted ~., data = train_set,
               method = "knn",
               metric="ROC",
               trControl = train_control)
knn_cv
## k-Nearest Neighbors 
## 
## 231 samples
##  43 predictor
##   2 classes: 'No', 'Yes' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 208, 207, 209, 208, 207, 207, ... 
## Resampling results across tuning parameters:
## 
##   k  ROC        Sens       Spec     
##   5  0.5697833  0.4288889  0.6164835
##   7  0.5864866  0.4622222  0.6741758
##   9  0.6021215  0.4400000  0.6450549
## 
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was k = 9.
knn_cv$bestTune
##   k
## 3 9
knn_cv$results
##   k       ROC      Sens      Spec     ROCSD    SensSD     SpecSD
## 1 5 0.5697833 0.4288889 0.6164835 0.1133132 0.2236497 0.11640631
## 2 7 0.5864866 0.4622222 0.6741758 0.1210569 0.2062933 0.08920185
## 3 9 0.6021215 0.4400000 0.6450549 0.1208528 0.2272397 0.09103135

#K-nn cv prediction on test set

knn_cv_pred <- predict(knn_cv, test_set)
knn_cv_pred
##  [1] No  Yes No  Yes Yes No  Yes Yes Yes No  No  No  Yes No  No  No  No  No  No 
## [20] No  Yes No  No  No  Yes Yes Yes No  Yes No  No  Yes Yes No  No  Yes Yes No 
## [39] Yes No  Yes Yes No  No  Yes Yes Yes No  No  Yes Yes No  Yes Yes Yes Yes No 
## [58] Yes
## Levels: No Yes

#Obtain probabilities

knn_cv_prob <- predict(knn_cv, test_set, type = "prob")
knn_cv_prob
##           No       Yes
## 1  0.5555556 0.4444444
## 2  0.4444444 0.5555556
## 3  0.6666667 0.3333333
## 4  0.3333333 0.6666667
## 5  0.4444444 0.5555556
## 6  0.7777778 0.2222222
## 7  0.3333333 0.6666667
## 8  0.1111111 0.8888889
## 9  0.3333333 0.6666667
## 10 0.6666667 0.3333333
## 11 0.7777778 0.2222222
## 12 0.5555556 0.4444444
## 13 0.3333333 0.6666667
## 14 0.6666667 0.3333333
## 15 0.5555556 0.4444444
## 16 0.5555556 0.4444444
## 17 0.7777778 0.2222222
## 18 0.6666667 0.3333333
## 19 0.7777778 0.2222222
## 20 0.6666667 0.3333333
## 21 0.4444444 0.5555556
## 22 0.5555556 0.4444444
## 23 0.5555556 0.4444444
## 24 0.5555556 0.4444444
## 25 0.4444444 0.5555556
## 26 0.2222222 0.7777778
## 27 0.2222222 0.7777778
## 28 0.5555556 0.4444444
## 29 0.4444444 0.5555556
## 30 1.0000000 0.0000000
## 31 0.6666667 0.3333333
## 32 0.1111111 0.8888889
## 33 0.1111111 0.8888889
## 34 0.6666667 0.3333333
## 35 0.6666667 0.3333333
## 36 0.4444444 0.5555556
## 37 0.3333333 0.6666667
## 38 0.6666667 0.3333333
## 39 0.4444444 0.5555556
## 40 0.6666667 0.3333333
## 41 0.3333333 0.6666667
## 42 0.3333333 0.6666667
## 43 0.6666667 0.3333333
## 44 0.5555556 0.4444444
## 45 0.1111111 0.8888889
## 46 0.4444444 0.5555556
## 47 0.3333333 0.6666667
## 48 0.8888889 0.1111111
## 49 0.8888889 0.1111111
## 50 0.4444444 0.5555556
## 51 0.0000000 1.0000000
## 52 0.7777778 0.2222222
## 53 0.1111111 0.8888889
## 54 0.4444444 0.5555556
## 55 0.4444444 0.5555556
## 56 0.2222222 0.7777778
## 57 0.5555556 0.4444444
## 58 0.2222222 0.7777778

#Confusion matrix

knn_cv_cm <- confusionMatrix(knn_cv_pred, test_set$readmitted)
knn_cv_cm
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction No Yes
##        No  16  13
##        Yes  7  22
##                                           
##                Accuracy : 0.6552          
##                  95% CI : (0.5188, 0.7751)
##     No Information Rate : 0.6034          
##     P-Value [Acc > NIR] : 0.2529          
##                                           
##                   Kappa : 0.3103          
##                                           
##  Mcnemar's Test P-Value : 0.2636          
##                                           
##             Sensitivity : 0.6957          
##             Specificity : 0.6286          
##          Pos Pred Value : 0.5517          
##          Neg Pred Value : 0.7586          
##              Prevalence : 0.3966          
##          Detection Rate : 0.2759          
##    Detection Prevalence : 0.5000          
##       Balanced Accuracy : 0.6621          
##                                           
##        'Positive' Class : No              
## 

#K-nn Cross Validation ROC

knn_cv_roc <- roc(test_set$readmitted, knn_cv_prob$Yes, plot = T, print.auc = TRUE)
## Setting levels: control = No, case = Yes
## Setting direction: controls < cases

#K-nn Cross Validation AUC

knn_cv_auc <- auc(knn_cv_roc)
knn_cv_auc
## Area under the curve: 0.646

#Append comparison lists

Model_Names <- append(Model_Names, "K-nn Cross Validation")
Model_AUC_Values  <- append(Model_AUC_Values, knn_cv_auc)

Decision Tree Cross Validation.

#Model

dt_cv <- train(readmitted ~., data = train_set,
               method = "rpart",
               metric = "ROC",
               trControl = train_control)
dt_cv
## CART 
## 
## 231 samples
##  43 predictor
##   2 classes: 'No', 'Yes' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 208, 207, 208, 208, 208, 208, ... 
## Resampling results across tuning parameters:
## 
##   cp          ROC        Sens       Spec     
##   0.02867384  0.5675061  0.3544444  0.7247253
##   0.03225806  0.5661935  0.2900000  0.7620879
##   0.17204301  0.5346825  0.1622222  0.9071429
## 
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.02867384.
dt_cv$results
##           cp       ROC      Sens      Spec      ROCSD    SensSD    SpecSD
## 1 0.02867384 0.5675061 0.3544444 0.7247253 0.08394223 0.1232226 0.1262395
## 2 0.03225806 0.5661935 0.2900000 0.7620879 0.07959142 0.1492046 0.1289630
## 3 0.17204301 0.5346825 0.1622222 0.9071429 0.05329365 0.2242011 0.1390362

#Best tune parameters

dt_cv$bestTune
##           cp
## 1 0.02867384

#Prediction on test

dt_cv_pred <- predict(dt_cv, test_set)
dt_cv_pred
##  [1] No  No  No  No  Yes No  Yes Yes Yes No  No  No  Yes Yes No  No  No  No  No 
## [20] Yes Yes Yes Yes Yes Yes Yes Yes No  Yes Yes Yes Yes Yes Yes Yes Yes Yes No 
## [39] Yes No  Yes Yes No  No  Yes No  Yes No  No  Yes Yes No  Yes No  Yes Yes Yes
## [58] Yes
## Levels: No Yes

#Obtain probabilities

dt_cv_prob <- predict(dt_cv, test_set, type = "prob")
dt_cv_prob
##               No       Yes
## 461    0.8571429 0.1428571
## 824    0.6363636 0.3636364
## 962    0.6176471 0.3823529
## 1984   0.6176471 0.3823529
## 2309   0.1612903 0.8387097
## 5016   0.8571429 0.1428571
## 6129   0.1612903 0.8387097
## 6452   0.2745098 0.7254902
## 7698   0.2800000 0.7200000
## 9471   0.8571429 0.1428571
## 11728  0.6176471 0.3823529
## 15359  0.6176471 0.3823529
## 15538  0.2800000 0.7200000
## 17795  0.1612903 0.8387097
## 18511  0.6176471 0.3823529
## 20963  0.6176471 0.3823529
## 23943  0.6176471 0.3823529
## 25851  0.6176471 0.3823529
## 29060  1.0000000 0.0000000
## 29366  0.1612903 0.8387097
## 30649  0.2745098 0.7254902
## 31935  0.2745098 0.7254902
## 32130  0.1612903 0.8387097
## 34606  0.2745098 0.7254902
## 35975  0.2800000 0.7200000
## 36349  0.1612903 0.8387097
## 36685  0.2745098 0.7254902
## 39784  0.6176471 0.3823529
## 39833  0.1612903 0.8387097
## 40684  0.2745098 0.7254902
## 44259  0.2745098 0.7254902
## 44286  0.1612903 0.8387097
## 46721  0.2800000 0.7200000
## 51517  0.2745098 0.7254902
## 57291  0.1612903 0.8387097
## 57846  0.1612903 0.8387097
## 58435  0.1612903 0.8387097
## 60051  0.6176471 0.3823529
## 65125  0.1612903 0.8387097
## 66118  0.8571429 0.1428571
## 66395  0.2800000 0.7200000
## 68026  0.2745098 0.7254902
## 68269  0.6176471 0.3823529
## 68348  0.6176471 0.3823529
## 70681  0.1612903 0.8387097
## 71862  0.8571429 0.1428571
## 72980  0.1612903 0.8387097
## 74933  0.6176471 0.3823529
## 76453  0.6176471 0.3823529
## 76775  0.2745098 0.7254902
## 76814  0.2745098 0.7254902
## 77043  0.6176471 0.3823529
## 78993  0.2800000 0.7200000
## 79192  0.6176471 0.3823529
## 82660  0.1612903 0.8387097
## 84909  0.2800000 0.7200000
## 88479  0.2745098 0.7254902
## 101030 0.2800000 0.7200000

#DT CV confusion matrix

dt_cv_cm <- confusionMatrix(dt_cv_pred, test_set$readmitted)
dt_cv_cm
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction No Yes
##        No  13  10
##        Yes 10  25
##                                           
##                Accuracy : 0.6552          
##                  95% CI : (0.5188, 0.7751)
##     No Information Rate : 0.6034          
##     P-Value [Acc > NIR] : 0.2529          
##                                           
##                   Kappa : 0.2795          
##                                           
##  Mcnemar's Test P-Value : 1.0000          
##                                           
##             Sensitivity : 0.5652          
##             Specificity : 0.7143          
##          Pos Pred Value : 0.5652          
##          Neg Pred Value : 0.7143          
##              Prevalence : 0.3966          
##          Detection Rate : 0.2241          
##    Detection Prevalence : 0.3966          
##       Balanced Accuracy : 0.6398          
##                                           
##        'Positive' Class : No              
## 

#DT CV ROC

dt_cv_roc <- roc(test_set$readmitted, dt_cv_prob$Yes, plot = T, print.auc = TRUE)
## Setting levels: control = No, case = Yes
## Setting direction: controls < cases

#DT CV AUC

dt_cv_auc <- auc(dt_cv_roc)
dt_cv_auc
## Area under the curve: 0.7081

#Append comparison lists

Model_Names <- append(Model_Names, "Decision Tree Cross Validation")
Model_AUC_Values  <- append(Model_AUC_Values, dt_cv_auc)

Random Forest Cross Validation.

#Model

rf_cv <- train(readmitted~., data = train_set, 
               method = "rf",
               metric = "ROC",
               trControl = train_control)
rf_cv
## Random Forest 
## 
## 231 samples
##  43 predictor
##   2 classes: 'No', 'Yes' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 207, 208, 208, 209, 208, 207, ... 
## Resampling results across tuning parameters:
## 
##   mtry  ROC        Sens       Spec     
##    2    0.6322436  0.2477778  0.8543956
##   22    0.6576374  0.3955556  0.7609890
##   43    0.6564713  0.3966667  0.7313187
## 
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 22.
rf_cv$results 
##   mtry       ROC      Sens      Spec      ROCSD     SensSD     SpecSD
## 1    2 0.6322436 0.2477778 0.8543956 0.03614236 0.05291243 0.07723858
## 2   22 0.6576374 0.3955556 0.7609890 0.06861013 0.14327159 0.07614512
## 3   43 0.6564713 0.3966667 0.7313187 0.08186449 0.13011022 0.06126367
rf_cv$bestTune
##   mtry
## 2   22

#RF CV prediction on test data

rf_cv_pred <- predict(rf_cv, test_set)
rf_cv_pred
##  [1] Yes No  No  Yes Yes No  Yes Yes Yes No  No  No  Yes Yes Yes No  No  No  Yes
## [20] Yes Yes Yes Yes Yes Yes Yes Yes No  Yes Yes Yes Yes Yes No  Yes Yes Yes No 
## [39] Yes Yes Yes Yes No  No  Yes No  Yes No  No  Yes Yes No  Yes No  Yes Yes Yes
## [58] Yes
## Levels: No Yes

#Obtain probabilities

rf_cv_prob <- predict(rf_cv, test_set, type = "prob")
rf_cv_prob
##           No   Yes
## 461    0.312 0.688
## 824    0.684 0.316
## 962    0.780 0.220
## 1984   0.398 0.602
## 2309   0.360 0.640
## 5016   0.734 0.266
## 6129   0.250 0.750
## 6452   0.374 0.626
## 7698   0.218 0.782
## 9471   0.784 0.216
## 11728  0.504 0.496
## 15359  0.556 0.444
## 15538  0.204 0.796
## 17795  0.278 0.722
## 18511  0.498 0.502
## 20963  0.736 0.264
## 23943  0.764 0.236
## 25851  0.618 0.382
## 29060  0.460 0.540
## 29366  0.372 0.628
## 30649  0.424 0.576
## 31935  0.462 0.538
## 32130  0.118 0.882
## 34606  0.462 0.538
## 35975  0.258 0.742
## 36349  0.074 0.926
## 36685  0.208 0.792
## 39784  0.708 0.292
## 39833  0.118 0.882
## 40684  0.438 0.562
## 44259  0.384 0.616
## 44286  0.204 0.796
## 46721  0.338 0.662
## 51517  0.576 0.424
## 57291  0.464 0.536
## 57846  0.376 0.624
## 58435  0.300 0.700
## 60051  0.640 0.360
## 65125  0.168 0.832
## 66118  0.494 0.506
## 66395  0.332 0.668
## 68026  0.272 0.728
## 68269  0.830 0.170
## 68348  0.666 0.334
## 70681  0.170 0.830
## 71862  0.644 0.356
## 72980  0.096 0.904
## 74933  0.838 0.162
## 76453  0.620 0.380
## 76775  0.306 0.694
## 76814  0.350 0.650
## 77043  0.546 0.454
## 78993  0.142 0.858
## 79192  0.512 0.488
## 82660  0.358 0.642
## 84909  0.388 0.612
## 88479  0.448 0.552
## 101030 0.314 0.686

#RF CV confusion matrix

rf_cv_cm <- confusionMatrix(rf_cv_pred, test_set$readmitted)
rf_cv_cm
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction No Yes
##        No  11   8
##        Yes 12  27
##                                           
##                Accuracy : 0.6552          
##                  95% CI : (0.5188, 0.7751)
##     No Information Rate : 0.6034          
##     P-Value [Acc > NIR] : 0.2529          
##                                           
##                   Kappa : 0.2574          
##                                           
##  Mcnemar's Test P-Value : 0.5023          
##                                           
##             Sensitivity : 0.4783          
##             Specificity : 0.7714          
##          Pos Pred Value : 0.5789          
##          Neg Pred Value : 0.6923          
##              Prevalence : 0.3966          
##          Detection Rate : 0.1897          
##    Detection Prevalence : 0.3276          
##       Balanced Accuracy : 0.6248          
##                                           
##        'Positive' Class : No              
## 

#RF CV ROC

rf_cv_roc <- roc(test_set$readmitted, rf_cv_prob$Yes, plot = T, print.auc = TRUE)
## Setting levels: control = No, case = Yes
## Setting direction: controls < cases

#RF CV AUC

rf_cv_auc <- auc(dt_cv_roc)
rf_cv_auc
## Area under the curve: 0.7081

#Append comparison lists

Model_Names <- append(Model_Names, "Random Forest Cross Validation")
Model_AUC_Values  <- append(Model_AUC_Values, rf_cv_auc)

XGBoost Cross Validation

#Setting hyperparameters

xgb_grid = expand.grid(nrounds = 100,
                       eta = 0.3,
                       max_depth = 1,
                       gamma = 0,
                       colsample_bytree = 0.6,
                       min_child_weight = 1,
                       subsample = 0.5)

#Model

xgb_cv <- train(readmitted~., data = train_variance,
                method = "xgbTree",
                metric = "ROC",
                tuneGrid = xgb_grid,
                trControl = train_control)
xgb_cv
## eXtreme Gradient Boosting 
## 
## 231 samples
##  43 predictor
##   2 classes: 'No', 'Yes' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 208, 207, 208, 209, 208, 208, ... 
## Resampling results:
## 
##   ROC        Sens       Spec     
##   0.6592857  0.4966667  0.7032967
## 
## Tuning parameter 'nrounds' was held constant at a value of 100
## Tuning
##  held constant at a value of 1
## Tuning parameter 'subsample' was held
##  constant at a value of 0.5
xgb_cv$results
##   nrounds eta max_depth gamma colsample_bytree min_child_weight subsample
## 1     100 0.3         1     0              0.6                1       0.5
##         ROC      Sens      Spec     ROCSD    SensSD    SpecSD
## 1 0.6592857 0.4966667 0.7032967 0.1259415 0.2499959 0.1437466
xgb_cv$bestTune
##   nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 1     100         1 0.3     0              0.6                1       0.5

#Prediction of test set

xgb_cv_pred <- predict(xgb_cv, test_variance)
xgb_cv_pred
##  [1] Yes No  No  Yes Yes No  Yes Yes Yes No  Yes No  Yes Yes No  No  No  No  No 
## [20] Yes Yes Yes Yes Yes Yes Yes Yes No  Yes No  No  Yes Yes Yes Yes Yes Yes No 
## [39] Yes No  Yes Yes No  No  Yes No  Yes No  No  Yes Yes No  Yes No  Yes Yes Yes
## [58] Yes
## Levels: No Yes

#Obtain probabilities

xgb_cv_prob <- predict(xgb_cv, test_variance, type = "prob")
xgb_cv_prob
##            No        Yes
## 1  0.32917005 0.67082995
## 2  0.86825436 0.13174564
## 3  0.82599509 0.17400491
## 4  0.24610515 0.75389485
## 5  0.21723588 0.78276412
## 6  0.92182052 0.07817948
## 7  0.04456132 0.95543868
## 8  0.22743192 0.77256808
## 9  0.09605937 0.90394063
## 10 0.90615952 0.09384048
## 11 0.11220569 0.88779431
## 12 0.92583340 0.07416660
## 13 0.37088314 0.62911686
## 14 0.19849150 0.80150850
## 15 0.58358234 0.41641766
## 16 0.90069807 0.09930193
## 17 0.85694236 0.14305764
## 18 0.80964696 0.19035304
## 19 0.52580488 0.47419512
## 20 0.30859825 0.69140175
## 21 0.17238551 0.82761449
## 22 0.31953821 0.68046179
## 23 0.12107700 0.87892300
## 24 0.30053511 0.69946489
## 25 0.25890693 0.74109307
## 26 0.06948786 0.93051214
## 27 0.31062853 0.68937147
## 28 0.86036950 0.13963050
## 29 0.31823820 0.68176180
## 30 0.51464057 0.48535943
## 31 0.68005693 0.31994307
## 32 0.30210975 0.69789025
## 33 0.30628887 0.69371113
## 34 0.42075387 0.57924613
## 35 0.28659305 0.71340695
## 36 0.49482372 0.50517628
## 37 0.08175056 0.91824944
## 38 0.66953462 0.33046538
## 39 0.03546740 0.96453260
## 40 0.71574247 0.28425753
## 41 0.41602308 0.58397692
## 42 0.32529932 0.67470068
## 43 0.71841925 0.28158075
## 44 0.72792310 0.27207690
## 45 0.05762353 0.94237647
## 46 0.74586344 0.25413656
## 47 0.18621323 0.81378677
## 48 0.91335452 0.08664548
## 49 0.68886632 0.31113368
## 50 0.09414817 0.90585183
## 51 0.14021052 0.85978948
## 52 0.51479864 0.48520136
## 53 0.21080305 0.78919695
## 54 0.74617946 0.25382054
## 55 0.14976101 0.85023899
## 56 0.34436220 0.65563780
## 57 0.45173427 0.54826573
## 58 0.28236154 0.71763846

#XGB CV confusion matrix

xgb_cv_cm <- confusionMatrix(xgb_cv_pred, test_variance$readmitted, positive = "Yes")
xgb_cv_cm
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction No Yes
##        No  14   8
##        Yes  9  27
##                                           
##                Accuracy : 0.7069          
##                  95% CI : (0.5727, 0.8191)
##     No Information Rate : 0.6034          
##     P-Value [Acc > NIR] : 0.06804         
##                                           
##                   Kappa : 0.383           
##                                           
##  Mcnemar's Test P-Value : 1.00000         
##                                           
##             Sensitivity : 0.7714          
##             Specificity : 0.6087          
##          Pos Pred Value : 0.7500          
##          Neg Pred Value : 0.6364          
##              Prevalence : 0.6034          
##          Detection Rate : 0.4655          
##    Detection Prevalence : 0.6207          
##       Balanced Accuracy : 0.6901          
##                                           
##        'Positive' Class : Yes             
## 

XGB CV ROC

xgb_cv_roc <- roc(test_variance$readmitted, xgb_cv_prob$Yes, plot = T, print.auc = TRUE)
## Setting levels: control = No, case = Yes
## Setting direction: controls < cases

#XGB CV AUC

xgb_cv_auc <- auc(xgb_cv_roc)
xgb_cv_auc
## Area under the curve: 0.7155

#Append comparison lists

Model_Names <- append(Model_Names, "XGBoost Cross Validation")
Model_AUC_Values  <- append(Model_AUC_Values, xgb_cv_auc)

Model Comparison

Model AUC table

model_compare <- data.frame(Model_Names, Model_AUC_Values)
model_compare
##                             Model_Names Model_AUC_Values
## 1                   Logistic Regression        0.7378882
## 2                                GLMNet        0.7093168
## 3                         Decision Tree        0.6260870
## 4                                   SVM        0.4857143
## 5                                  K-nn        0.5329193
## 6                               XGBoost        0.6496894
## 7               Logistic Regression PCA        0.6447205
## 8                               SVM PCA        0.4857143
## 9                              K-nn PCA        0.5459627
## 10 Logistic Regression Cross Validation        0.7267081
## 11              GLMNet Cross Validation        0.6993789
## 12        SVM (radial) Cross Validation        0.6968944
## 13       SVM (linear) Cross Validation         0.7155280
## 14                K-nn Cross Validation        0.6459627
## 15       Decision Tree Cross Validation        0.7080745
## 16       Random Forest Cross Validation        0.7080745
## 17             XGBoost Cross Validation        0.7155280

Models ROC plots

log_reg_plot = plot(lr_roc,  col = "#ce7e00")
text(x = 1.2, y = 0.2, labels = paste("AUC =", round(auc(lr_roc), 2)), col = "#ce7e00")
dt_plot = plot(dt_roc, add = TRUE, col = "#ffbf00")
text(x = 1.2, y = 0.3, labels = paste("AUC =", round(auc(dt_roc), 2)), col = "#ffbf00")
rf_plot = plot(rf_roc, add = TRUE, col = "#0b5394")
text(x = 1.2, y = 0.4, labels = paste("AUC =", round(auc(rf_roc), 2)), col = "#0b5394")
svm_plot = plot(svm_roc, add = TRUE, col = "#741b47") 
text(x = 1.2, y = 0.5, labels = paste("AUC =", round(auc(svm_roc), 2)), col = "#741b47")
knn_plot = plot(knn_roc, add = TRUE, col = "#38761d") 
text(x = 1.2, y = 0.6, labels = paste("AUC =", round(auc(knn_roc), 2)), col = "#38761d")
xgb_plot = plot(xgb_roc, add= TRUE, col = "#1ecdc9")
text(x = 1.2, y = 0.7, labels = paste("AUC =", round(auc(xgb_roc), 2)), col = "#1ecdc9")
glmnet_plot = plot(glmnet_roc, add = TRUE, col = "#ff95be")
text(x = 1.2, y = 0.8, labels = paste("AUC =", round(auc(glmnet_roc), 2)), col = "#ff95be")

PCA Model ROC Plots

log_reg_pca_plot = plot(lr_pca_roc,  col = "#ce7e00")
text(x = 1.2, y = 0.2, labels = paste("AUC =", round(auc(lr_pca_roc), 2)), col = "#ce7e00")
svm_pca_plot = plot(svm_pca_roc, add = TRUE, col = "#741b47")  
text(x = 1.2, y = 0.4, labels = paste("AUC =", round(auc(svm_pca_roc), 2)), col = "#741b47")
knn_pca_plot = plot(knn_pca_roc, add = TRUE, col = "#38761d") 
text(x = 1.2, y = 0.6, labels = paste("AUC =", round(auc(knn_pca_roc), 2)), col = "#38761d")

CV Model ROC Plots

lr_cv_plot = plot(lr_cv_roc,  col = "#4d982c")
text(x = 1.2, y = 0.2, labels = paste("AUC =", round(auc(lr_cv_roc), 2)), col = "#4d982c")
dt_cv_plot = plot(dt_cv_roc, add = TRUE, col = "#ffbf00")
text(x = 1.2, y = 0.3, labels = paste("AUC =", round(auc(dt_cv_roc), 2)), col = "#ffbf00")
rf_cv_plot = plot(rf_cv_roc, add = TRUE, col = "#0b5394")
text(x = 1.2, y = 0.4, labels = paste("AUC =", round(auc(rf_cv_roc), 2)), col = "#0b5394")
svmr_cv_plot = plot(svmr_cv_roc, add = TRUE, col = "#741b47") 
text(x = 1.2, y = 0.5, labels = paste("AUC =", round(auc(svmr_cv_roc), 2)), col = "#741b47")
svml_cv_plot = plot(svml_cv_roc, add = TRUE, col = "#a23ef3") 
text(x = 1.2, y = 0.6, labels = paste("AUC =", round(auc(svml_cv_roc), 2)), col = "#a23ef3")
knn_cv_plot = plot(knn_cv_roc, add = TRUE, col = "#38761d") 
text(x = 1.2, y = 0.7, labels = paste("AUC =", round(auc(knn_cv_roc), 2)), col = "#38761d")
xgb_cv_plot = plot(xgb_cv_roc, add= TRUE, col = "#1ecdc9")
text(x = 1.2, y = 0.8, labels = paste("AUC =", round(auc(xgb_cv_roc), 2)), col = "#1ecdc9")
glmnet_cv_plot = plot(glmnet_cv_roc, add = TRUE, col = "#ce7e00")
text(x = 1.2, y = 0.9, labels = paste("AUC =", round(auc(glmnet_cv_roc), 2)), col = "#ce7e00")

Performance Summary The aim of project was to predict hospital readmission based on several patient related predictors. The data-set contained missing/invalid observations which were removed as the data-set contained relatively large number of observations. The information from all factor predictor variables was incorporated in the data-set by creating their dummy variables. Thus final data-set comprised of all originally numeric + dummy variables and contained 44 predictors and 289 observations . Unsupervised techniques; Hierarchical and K-means Clustering were used to visualize grouping of similar data. Dimensionality Reduction was performed using PCA feature extraction technique. The PCA data and expanded data sets were then used for performance evaluation using several Supervised Learning Algorithms and Cross Validation techniques. All model performances were compared using the AUC metric. Based on outcomes of all models above, for prediction of readmission, Logistic Regression Model with time_in_hospital,num_lab_procedures,number_inpatient and Other_diag as significant predictors was deemed to be the best performing model at AUC of 74%.

R Markdown